diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index c51cd4ab..16303c9f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -57,9 +57,9 @@ jobs: pip install -r requirements.txt python -c 'import polars_ds as pds' python -c 'from polars_ds import linear_models' - python -c 'from polars_ds.ts_features import *' python -c 'from polars_ds.spatial import *' python -c 'from polars_ds.sample_and_split import *' + python -c 'from polars_ds.exprs.ts_features import *' - name: Upload sdist uses: actions/upload-artifact@v4 @@ -81,7 +81,8 @@ jobs: pip install jupyter ipython ipykernel nbconvert pip install -r tests/requirements-test.txt jupyter execute examples/basics.ipynb - jupyter execute examples/diagnosis.ipynb + jupyter execute examples/pipeline.ipynb + jupyter execute examples/eda.ipynb jupyter execute examples/sample_and_split.ipynb diff --git a/Cargo.lock b/Cargo.lock index 032a6c4c..afabdef5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -56,9 +56,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "android-tzdata" @@ -130,7 +130,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -141,7 +141,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -209,9 +209,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" dependencies = [ "arrayref", "arrayvec", @@ -258,22 +258,22 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.19.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3" dependencies = [ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +checksum = "3fa76293b4f7bb636ab88fd78228235b5248b4d05cc589aed610f954af5d7c7a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -284,9 +284,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "castaway" @@ -299,9 +299,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.31" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" +checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333" dependencies = [ "jobserver", "libc", @@ -328,9 +328,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.38" +version = "0.4.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" dependencies = [ "android-tzdata", "iana-time-zone", @@ -396,9 +396,9 @@ checksum = "7e8f1e641542c07631228b1e0dc04b69ae3c1d58ef65d5691a439711d805c698" [[package]] name = "compact_str" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" dependencies = [ "castaway", "cfg-if", @@ -417,9 +417,9 @@ checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "core-foundation" -version = "0.9.4" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63" dependencies = [ "core-foundation-sys", "libc", @@ -433,9 +433,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" dependencies = [ "libc", ] @@ -451,18 +451,18 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-deque" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", @@ -479,18 +479,18 @@ dependencies = [ [[package]] name = "crossbeam-queue" -version = "0.3.11" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" @@ -524,6 +524,17 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.93", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -564,7 +575,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -576,7 +587,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -590,11 +601,11 @@ dependencies = [ [[package]] name = "equator" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5099e7b6f0b7431c7a1c49f75929e2777693da192784f167066977a2965767af" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" dependencies = [ - "equator-macro 0.4.1", + "equator-macro 0.4.2", ] [[package]] @@ -605,18 +616,18 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] name = "equator-macro" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5322a90066ddae2b705096eb9e10c465c0498ae93bf9bdd6437415327c88e3bb" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -627,12 +638,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -651,7 +662,7 @@ dependencies = [ "coe-rs", "dbgf", "dyn-stack", - "equator 0.4.1", + "equator 0.4.2", "faer-entity", "gemm", "libm", @@ -709,9 +720,9 @@ checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" [[package]] name = "flate2" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", "miniz_oxide", @@ -734,9 +745,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "foldhash" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" [[package]] name = "form_urlencoded" @@ -813,7 +824,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -995,15 +1006,15 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "h2" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" +checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" dependencies = [ "atomic-waker", "bytes", @@ -1054,9 +1065,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" dependencies = [ "allocator-api2", "equivalent", @@ -1077,12 +1088,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "hex" version = "0.4.3" @@ -1091,18 +1096,18 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "home" -version = "0.5.9" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -1146,9 +1151,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.5.0" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" +checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" dependencies = [ "bytes", "futures-channel", @@ -1166,9 +1171,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.3" +version = "0.27.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", "http", @@ -1224,24 +1229,153 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.93", +] + [[package]] name = "idna" -version = "0.5.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", ] [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "serde", ] @@ -1265,9 +1399,9 @@ checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "iter-read" -version = "0.3.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c397ca3ea05ad509c4ec451fea28b4771236a376ca1c69fd5143aae0cf8f93c4" +checksum = "071ed4cc1afd86650602c7b11aa2e1ce30762a1c27193201cb5cee9c6ebb1294" [[package]] name = "itertools" @@ -1289,9 +1423,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "itoap" @@ -1330,18 +1464,19 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ + "once_cell", "wasm-bindgen", ] [[package]] name = "libc" -version = "0.2.161" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libm" @@ -1355,6 +1490,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + [[package]] name = "lock_api" version = "0.4.12" @@ -1458,20 +1599,19 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" dependencies = [ "adler2", ] [[package]] name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi", "libc", "wasi", "windows-sys 0.52.0", @@ -1704,9 +1844,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] @@ -1755,9 +1895,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "ordered-float" -version = "4.4.0" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83e7ccb95e240b7c9506a3d544f10d935e142cc90b0a1d56954fb44d89ad6b97" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" dependencies = [ "num-traits", ] @@ -1808,20 +1948,20 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pest" -version = "2.7.14" +version = "2.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442" +checksum = "8b7cafe60d6cf8e62e1b9b2ea516a089c008945bb5a275416789e7db0bc199dc" dependencies = [ "memchr", - "thiserror", + "thiserror 2.0.9", "ucd-trie", ] [[package]] name = "pest_derive" -version = "2.7.14" +version = "2.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d214365f632b123a47fd913301e14c946c61d1c183ee245fa76eb752e59a02dd" +checksum = "816518421cfc6887a0d62bf441b6ffb4536fcc926395a69e1a85852d4363f57e" dependencies = [ "pest", "pest_generator", @@ -1829,22 +1969,22 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.7.14" +version = "2.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb55586734301717aea2ac313f50b2eb8f60d2fc3dc01d190eefa2e625f60c4e" +checksum = "7d1396fd3a870fc7838768d171b4616d5c91f6cc25e377b673d714567d99377b" dependencies = [ "pest", "pest_meta", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] name = "pest_meta" -version = "2.7.14" +version = "2.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b75da2a70cf4d9cb76833c990ac9cd3923c9a8905a8929789ce347c84564d03d" +checksum = "e1e58089ea25d717bfd31fb534e4f3afcc2cc569c70de3e239778991ea3b7dea" dependencies = [ "once_cell", "pest", @@ -1952,7 +2092,7 @@ dependencies = [ "ethnum", "fast-float", "getrandom", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "itoa", "itoap", "lz4", @@ -2012,7 +2152,7 @@ dependencies = [ "chrono-tz", "either", "hashbrown 0.14.5", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "indexmap", "ndarray", "num-traits", @@ -2030,7 +2170,7 @@ dependencies = [ "serde", "serde_json", "strum_macros", - "thiserror", + "thiserror 1.0.69", "version_check", "xxhash-rust", ] @@ -2045,7 +2185,7 @@ dependencies = [ "polars-arrow-format", "regex", "simdutf8", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -2056,7 +2196,7 @@ checksum = "ea1b431ed816cba1120cff200f06b962748001bbb2e615ce53cfbbdf701cc136" dependencies = [ "ahash", "bitflags 2.6.0", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "num-traits", "once_cell", "polars-arrow", @@ -2074,9 +2214,9 @@ dependencies = [ [[package]] name = "polars-ffi" -version = "0.44.0" +version = "0.44.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3143e11ec85f120357243d69e4581dea434639f875eec06a01b090dba096bb0f" +checksum = "11f38ddf675f605d8b5228c3e282b630112d7431c884120be10d606c705c4989" dependencies = [ "polars-arrow", "polars-core", @@ -2097,7 +2237,7 @@ dependencies = [ "fs4", "futures", "glob", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "home", "itoa", "memchr", @@ -2136,7 +2276,7 @@ dependencies = [ "ahash", "chrono", "fallible-streaming-iterator", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "indexmap", "itoa", "num-traits", @@ -2209,7 +2349,7 @@ dependencies = [ "chrono", "chrono-tz", "either", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "hex", "indexmap", "memchr", @@ -2244,7 +2384,7 @@ dependencies = [ "ethnum", "flate2", "futures", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "lz4", "num-traits", "polars-arrow", @@ -2278,7 +2418,7 @@ dependencies = [ "crossbeam-channel", "crossbeam-queue", "enum_dispatch", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "num-traits", "polars-arrow", "polars-compute", @@ -2309,7 +2449,7 @@ dependencies = [ "ciborium", "either", "futures", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "memmap2", "num-traits", "once_cell", @@ -2439,7 +2579,7 @@ dependencies = [ "bytemuck", "bytes", "compact_str", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "indexmap", "libc", "memmap2", @@ -2464,7 +2604,7 @@ dependencies = [ "cfavml", "faer", "faer-ext", - "hashbrown 0.15.0", + "hashbrown 0.15.2", "inflections", "itertools 0.12.1", "jemallocator", @@ -2485,9 +2625,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "ppv-lite86" @@ -2509,18 +2649,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.89" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] [[package]] name = "psm" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa37f80ca58604976033fae9515a8a2989fc13797d953f7c04fb8fa36a11f205" +checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" dependencies = [ "cc", ] @@ -2597,7 +2737,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -2610,7 +2750,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -2629,7 +2769,7 @@ dependencies = [ "pyo3-polars-derive", "serde", "serde-pickle", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -2643,7 +2783,7 @@ dependencies = [ "polars-plan", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -2658,44 +2798,47 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" dependencies = [ "bytes", "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "rustls", "socket2", - "thiserror", + "thiserror 2.0.9", "tokio", "tracing", ] [[package]] name = "quinn-proto" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" +checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", + "getrandom", "rand", "ring", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "rustls", + "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.9", "tinyvec", "tracing", + "web-time", ] [[package]] name = "quinn-udp" -version = "0.5.6" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e346e016eacfff12233c243718197ca12f148c84e1e84268a896699b41c71780" +checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" dependencies = [ "cfg_aliases", "libc", @@ -2707,9 +2850,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] @@ -2836,14 +2979,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] name = "redox_syscall" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ "bitflags 2.6.0", ] @@ -2865,7 +3008,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -2882,9 +3025,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -2899,9 +3042,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "reqwest" -version = "0.12.9" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64", "bytes", @@ -2933,6 +3076,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", + "tower", "tower-service", "url", "wasm-bindgen", @@ -2971,9 +3115,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" [[package]] name = "rustfft" @@ -2992,22 +3136,22 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.38" +version = "0.38.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" +checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85" dependencies = [ "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "rustls" -version = "0.23.16" +version = "0.23.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" +checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" dependencies = [ "once_cell", "ring", @@ -3019,12 +3163,11 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" dependencies = [ "openssl-probe", - "rustls-pemfile", "rustls-pki-types", "schannel", "security-framework", @@ -3041,9 +3184,12 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +dependencies = [ + "web-time", +] [[package]] name = "rustls-webpki" @@ -3058,9 +3204,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" @@ -3079,9 +3225,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" dependencies = [ "windows-sys 0.59.0", ] @@ -3094,9 +3240,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.11.1" +version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +checksum = "81d3f8c9bfcc3cbb6b0179eb57042d75b1582bdc65c3cb95f3fa999509c03cbc" dependencies = [ "bitflags 2.6.0", "core-foundation", @@ -3107,9 +3253,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.0" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +checksum = "1863fd3768cd83c56a7f60faa4dc0d403f1b6df0a38c3c25f44b7894e45370d5" dependencies = [ "core-foundation-sys", "libc", @@ -3123,18 +3269,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde-pickle" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c762ad136a26407c6a80825813600ceeab5e613660d93d79a41f0ec877171e71" +checksum = "b641fdc8bcf2781ee78b30c599700d64ad4f412976143e4c5d0b9df906bb4843" dependencies = [ "byteorder", "iter-read", @@ -3145,20 +3291,20 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d" dependencies = [ "itoa", "memchr", @@ -3197,9 +3343,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simd-json" -version = "0.14.2" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1df0290e9bfe79ddd5ff8798ca887cd107b75353d2957efe9777296e17f26b5" +checksum = "aa2bcf6c6e164e81bc7a5d49fc6988b3d515d9e8c07457d7b74ffb9324b9cd40" dependencies = [ "ahash", "getrandom", @@ -3278,9 +3424,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -3301,6 +3447,12 @@ dependencies = [ "log", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "stacker" version = "0.1.17" @@ -3351,7 +3503,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -3373,9 +3525,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.85" +version = "2.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" +checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058" dependencies = [ "proc-macro2", "quote", @@ -3384,13 +3536,24 @@ dependencies = [ [[package]] name = "sync_wrapper" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" dependencies = [ "futures-core", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.93", +] + [[package]] name = "sysctl" version = "0.5.5" @@ -3401,7 +3564,7 @@ dependencies = [ "byteorder", "enum-as-inner", "libc", - "thiserror", + "thiserror 1.0.69", "walkdir", ] @@ -3432,29 +3595,59 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "thiserror" -version = "1.0.65" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d11abd9594d9b38965ef50805c5e469ca9cc6f197f883f717e0269a3057b3d5" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" +dependencies = [ + "thiserror-impl 2.0.9", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.93", ] [[package]] name = "thiserror-impl" -version = "1.0.65" +version = "2.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" +checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", +] + +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", ] [[package]] name = "tinyvec" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" dependencies = [ "tinyvec_macros", ] @@ -3467,9 +3660,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.0" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", @@ -3489,25 +3682,24 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ "rustls", - "rustls-pki-types", "tokio", ] [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -3516,6 +3708,27 @@ dependencies = [ "tokio", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + [[package]] name = "tower-service" version = "0.3.3" @@ -3524,9 +3737,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -3535,20 +3748,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", ] @@ -3581,17 +3794,11 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" -[[package]] -name = "unicode-bidi" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" - [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-normalization" @@ -3631,15 +3838,27 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.2" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", "percent-encoding", ] +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "uuid" version = "1.11.0" @@ -3694,9 +3913,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -3705,36 +3924,36 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.45" +version = "0.4.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" +checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" dependencies = [ "cfg-if", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3742,22 +3961,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] name = "wasm-streams" @@ -3774,9 +3993,19 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" dependencies = [ "js-sys", "wasm-bindgen", @@ -3852,7 +4081,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -3863,7 +4092,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", ] [[package]] @@ -3987,11 +4216,47 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "xxhash-rust" -version = "0.8.12" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.93", + "synstructure", +] [[package]] name = "zerocopy" @@ -4011,7 +4276,28 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.93", +] + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.93", + "synstructure", ] [[package]] @@ -4020,6 +4306,28 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.93", +] + [[package]] name = "zstd" version = "0.13.2" diff --git a/README.md b/README.md index 0af79fa9..27037c63 100644 --- a/README.md +++ b/README.md @@ -229,7 +229,7 @@ If your code already executes under 1s and you only use your code in non-product # Disclaimer -**Currently in Beta. Feel free to submit feature requests in the issues section of the repo. This library will only depend on python Polars (for most of its core) and will try to be as stable as possible for polars>=1 (It currently supports polars>=0.20.16 but that will be dropped soon). Exceptions will be made when Polars's update forces changes in the plugins.** +**Currently in Beta. Feel free to submit feature requests in the issues section of the repo. This library will only depend on python Polars (for most of its core) and will try to be as stable as possible for polars>=1. Exceptions will be made when Polars's update forces changes in the plugins.** This package is not tested with Polars streaming mode and is not designed to work with data so big that has to be streamed. diff --git a/SKLEARN_COMPATIBILITY.md b/SKLEARN_COMPATIBILITY.md index 041e94fb..6af5f35c 100644 --- a/SKLEARN_COMPATIBILITY.md +++ b/SKLEARN_COMPATIBILITY.md @@ -36,8 +36,7 @@ class CustomPDSTransformer(BaseEstimator, TransformerMixin): def fit(self, df, y=None): # specify all the rules for the transform here bp = ( - pds_pipe.Blueprint(df, name = "example", target = "approved") - .lowercase() + pds_pipe.Blueprint(df, name = "example", target = "approved", lowercase=True) .filter( "city_category is not null" # or equivalently, you can do: pl.col("city_category").is_not_null() ) diff --git a/docs/dia.md b/docs/dia.md deleted file mode 100644 index 1a9a08c8..00000000 --- a/docs/dia.md +++ /dev/null @@ -1,3 +0,0 @@ -## Data Inspection Assistant and Diagnosis - -::: polars_ds.diagnosis \ No newline at end of file diff --git a/docs/eda.md b/docs/eda.md new file mode 100644 index 00000000..cbf2c339 --- /dev/null +++ b/docs/eda.md @@ -0,0 +1,3 @@ +## Explorative Data Analysis + +::: polars_ds.eda \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index de3bd728..8db7df09 100644 --- a/docs/index.md +++ b/docs/index.md @@ -218,7 +218,7 @@ If your code already executes under 1s and you only use your code in non-product # Disclaimer -**Currently in Beta. Feel free to submit feature requests in the issues section of the repo. This library will only depend on python Polars (for most of its core) and will try to be as stable as possible for polars>=1 (It currently supports polars>=0.20.16 but that will be dropped soon). Exceptions will be made when Polars's update forces changes in the plugins.** +**Currently in Beta. Feel free to submit feature requests in the issues section of the repo. This library will only depend on python Polars (for most of its core) and will try to be as stable as possible for polars>=1. Exceptions will be made when Polars's update forces changes in the plugins.** This package is not tested with Polars streaming mode and is not designed to work with data so big that has to be streamed. diff --git a/examples/basics.ipynb b/examples/basics.ipynb index 00ba148c..09903294 100644 --- a/examples/basics.ipynb +++ b/examples/basics.ipynb @@ -46,7 +46,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 13)
ftime_idxdummyactualpredicteddummy_groupsx1x2x3abyy2
f64i64stri32f64strf64f64f64f64f64f64f64
0.00"a"10.836972"a"0.1811840.8331640.2428370.9633570.645792-0.0871060.374231
0.8414711"a"00.160224"a"0.7160970.72390.0061440.5237030.3082670.3154130.418257
0.9092972"a"00.289834"a"0.8035620.1386190.3991910.4808380.035734-0.4366030.127021
0.141123"a"00.192884"a"0.9242350.082840.0717270.8540510.9430420.0559430.150295
-0.7568024"a"00.370113"a"0.4608230.0854750.9671260.9650460.556006-1.3558570.00166
" + "shape: (5, 13)
ftime_idxdummyactualpredicteddummy_groupsx1x2x3abyy2
f64i64stri32f64strf64f64f64f64f64f64f64
0.00"a"00.870124"a"0.4996760.1169380.5879460.1319780.324241-0.7718570.058872
0.8414711"a"00.626794"a"0.7918790.0940170.3743730.4897860.504696-0.4145560.107824
0.9092972"a"00.362253"a"0.5267410.5444680.6495370.8245650.414665-0.731910.248602
0.141123"a"10.236681"a"0.1969690.2644930.7362930.427340.734808-0.9954510.071098
-0.7568024"a"00.147215"a"0.7864990.9643150.7640420.7886270.2992-0.7387820.459861
" ], "text/plain": [ "shape: (5, 13)\n", @@ -55,11 +55,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ i64 ┆ str ┆ i32 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪══════════╪═══════╪════════╪═══╪══════════╪══════════╪═══════════╪══════════╡\n", - "│ 0.0 ┆ 0 ┆ a ┆ 1 ┆ … ┆ 0.963357 ┆ 0.645792 ┆ -0.087106 ┆ 0.374231 │\n", - "│ 0.841471 ┆ 1 ┆ a ┆ 0 ┆ … ┆ 0.523703 ┆ 0.308267 ┆ 0.315413 ┆ 0.418257 │\n", - "│ 0.909297 ┆ 2 ┆ a ┆ 0 ┆ … ┆ 0.480838 ┆ 0.035734 ┆ -0.436603 ┆ 0.127021 │\n", - "│ 0.14112 ┆ 3 ┆ a ┆ 0 ┆ … ┆ 0.854051 ┆ 0.943042 ┆ 0.055943 ┆ 0.150295 │\n", - "│ -0.756802 ┆ 4 ┆ a ┆ 0 ┆ … ┆ 0.965046 ┆ 0.556006 ┆ -1.355857 ┆ 0.00166 │\n", + "│ 0.0 ┆ 0 ┆ a ┆ 0 ┆ … ┆ 0.131978 ┆ 0.324241 ┆ -0.771857 ┆ 0.058872 │\n", + "│ 0.841471 ┆ 1 ┆ a ┆ 0 ┆ … ┆ 0.489786 ┆ 0.504696 ┆ -0.414556 ┆ 0.107824 │\n", + "│ 0.909297 ┆ 2 ┆ a ┆ 0 ┆ … ┆ 0.824565 ┆ 0.414665 ┆ -0.73191 ┆ 0.248602 │\n", + "│ 0.14112 ┆ 3 ┆ a ┆ 1 ┆ … ┆ 0.42734 ┆ 0.734808 ┆ -0.995451 ┆ 0.071098 │\n", + "│ -0.756802 ┆ 4 ┆ a ┆ 0 ┆ … ┆ 0.788627 ┆ 0.2992 ┆ -0.738782 ┆ 0.459861 │\n", "└───────────┴──────────┴───────┴────────┴───┴──────────┴──────────┴───────────┴──────────┘" ] }, @@ -218,7 +218,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
fab
f64f64f64
1.3944e-15-0.963357-0.645792
-0.841471-0.523703-0.308267
-0.909297-0.480838-0.035734
-0.14112-0.854051-0.943042
0.756802-0.0016880.089786
" + "shape: (5, 3)
fab
f64f64f64
1.3944e-15-0.131978-0.324241
-0.841471-0.489786-0.504696
-0.909297-0.824565-0.414665
-0.14112-0.42734-0.734808
0.756802-0.6566490.025041
" ], "text/plain": [ "shape: (5, 3)\n", @@ -227,11 +227,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 │\n", "╞════════════╪═══════════╪═══════════╡\n", - "│ 1.3944e-15 ┆ -0.963357 ┆ -0.645792 │\n", - "│ -0.841471 ┆ -0.523703 ┆ -0.308267 │\n", - "│ -0.909297 ┆ -0.480838 ┆ -0.035734 │\n", - "│ -0.14112 ┆ -0.854051 ┆ -0.943042 │\n", - "│ 0.756802 ┆ -0.001688 ┆ 0.089786 │\n", + "│ 1.3944e-15 ┆ -0.131978 ┆ -0.324241 │\n", + "│ -0.841471 ┆ -0.489786 ┆ -0.504696 │\n", + "│ -0.909297 ┆ -0.824565 ┆ -0.414665 │\n", + "│ -0.14112 ┆ -0.42734 ┆ -0.734808 │\n", + "│ 0.756802 ┆ -0.656649 ┆ 0.025041 │\n", "└────────────┴───────────┴───────────┘" ] }, @@ -268,7 +268,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
coeffs
list[f64]
[-0.500734, -0.338584]
" + "shape: (1, 1)
coeffs
list[f64]
[-0.497998, -0.333503]
" ], "text/plain": [ "shape: (1, 1)\n", @@ -277,7 +277,7 @@ "│ --- │\n", "│ list[f64] │\n", "╞════════════════════════╡\n", - "│ [-0.500734, -0.338584] │\n", + "│ [-0.497998, -0.333503] │\n", "└────────────────────────┘" ] }, @@ -313,17 +313,17 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 2)
target_0target_1
list[f64]list[f64]
[-0.500734, -0.338584][0.086658, 0.407468]
" + "shape: (1, 2)
target_0target_1
list[f64]list[f64]
[-0.497998, -0.333503][0.08684, 0.407807]
" ], "text/plain": [ "shape: (1, 2)\n", - "┌────────────────────────┬──────────────────────┐\n", - "│ target_0 ┆ target_1 │\n", - "│ --- ┆ --- │\n", - "│ list[f64] ┆ list[f64] │\n", - "╞════════════════════════╪══════════════════════╡\n", - "│ [-0.500734, -0.338584] ┆ [0.086658, 0.407468] │\n", - "└────────────────────────┴──────────────────────┘" + "┌────────────────────────┬─────────────────────┐\n", + "│ target_0 ┆ target_1 │\n", + "│ --- ┆ --- │\n", + "│ list[f64] ┆ list[f64] │\n", + "╞════════════════════════╪═════════════════════╡\n", + "│ [-0.497998, -0.333503] ┆ [0.08684, 0.407807] │\n", + "└────────────────────────┴─────────────────────┘" ] }, "execution_count": 8, @@ -358,7 +358,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (4, 7)
featuresbetastd_errtp>|t|0.0250.975
strf64f64f64f64f64f64
"ln(x1+1)"0.2200870.001678131.1407370.00.2167970.223376
"exp(x2)"0.1744490.000676258.1797750.00.1731250.175774
"sin(x3)"-1.7457810.001346-1297.0839540.0-1.748419-1.743142
"__bias__"-0.1069510.0015-71.2928130.0-0.109891-0.10401
" + "shape: (4, 7)
featuresbetastd_errtp>|t|0.0250.975
strf64f64f64f64f64f64
"ln(x1+1)"0.2191730.001673131.0060170.00.2158930.222452
"exp(x2)"0.1755140.000678258.8364170.00.1741850.176843
"sin(x3)"-1.7411220.001334-1304.7829090.0-1.743738-1.738507
"__bias__"-0.1104940.001485-74.4087520.0-0.113404-0.107583
" ], "text/plain": [ "shape: (4, 7)\n", @@ -367,10 +367,10 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪═══════════╪══════════╪══════════════╪═══════╪═══════════╪═══════════╡\n", - "│ ln(x1+1) ┆ 0.220087 ┆ 0.001678 ┆ 131.140737 ┆ 0.0 ┆ 0.216797 ┆ 0.223376 │\n", - "│ exp(x2) ┆ 0.174449 ┆ 0.000676 ┆ 258.179775 ┆ 0.0 ┆ 0.173125 ┆ 0.175774 │\n", - "│ sin(x3) ┆ -1.745781 ┆ 0.001346 ┆ -1297.083954 ┆ 0.0 ┆ -1.748419 ┆ -1.743142 │\n", - "│ __bias__ ┆ -0.106951 ┆ 0.0015 ┆ -71.292813 ┆ 0.0 ┆ -0.109891 ┆ -0.10401 │\n", + "│ ln(x1+1) ┆ 0.219173 ┆ 0.001673 ┆ 131.006017 ┆ 0.0 ┆ 0.215893 ┆ 0.222452 │\n", + "│ exp(x2) ┆ 0.175514 ┆ 0.000678 ┆ 258.836417 ┆ 0.0 ┆ 0.174185 ┆ 0.176843 │\n", + "│ sin(x3) ┆ -1.741122 ┆ 0.001334 ┆ -1304.782909 ┆ 0.0 ┆ -1.743738 ┆ -1.738507 │\n", + "│ __bias__ ┆ -0.110494 ┆ 0.001485 ┆ -74.408752 ┆ 0.0 ┆ -0.113404 ┆ -0.107583 │\n", "└──────────┴───────────┴──────────┴──────────────┴───────┴───────────┴───────────┘" ] }, @@ -407,7 +407,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (10_000, 2)
dummycoeffs
strlist[f64]
"a"[-0.479674, -0.344547]
"a"[-0.479674, -0.344547]
"a"[-0.479674, -0.344547]
"a"[-0.479674, -0.344547]
"a"[-0.479674, -0.344547]
"b"[-0.5218, -0.33279]
"b"[-0.5218, -0.33279]
"b"[-0.5218, -0.33279]
"b"[-0.5218, -0.33279]
"b"[-0.5218, -0.33279]
" + "shape: (10_000, 2)
dummycoeffs
strlist[f64]
"a"[-0.491841, -0.342778]
"a"[-0.491841, -0.342778]
"a"[-0.491841, -0.342778]
"a"[-0.491841, -0.342778]
"a"[-0.491841, -0.342778]
"b"[-0.504246, -0.324211]
"b"[-0.504246, -0.324211]
"b"[-0.504246, -0.324211]
"b"[-0.504246, -0.324211]
"b"[-0.504246, -0.324211]
" ], "text/plain": [ "shape: (10_000, 2)\n", @@ -416,17 +416,17 @@ "│ --- ┆ --- │\n", "│ str ┆ list[f64] │\n", "╞═══════╪════════════════════════╡\n", - "│ a ┆ [-0.479674, -0.344547] │\n", - "│ a ┆ [-0.479674, -0.344547] │\n", - "│ a ┆ [-0.479674, -0.344547] │\n", - "│ a ┆ [-0.479674, -0.344547] │\n", - "│ a ┆ [-0.479674, -0.344547] │\n", + "│ a ┆ [-0.491841, -0.342778] │\n", + "│ a ┆ [-0.491841, -0.342778] │\n", + "│ a ┆ [-0.491841, -0.342778] │\n", + "│ a ┆ [-0.491841, -0.342778] │\n", + "│ a ┆ [-0.491841, -0.342778] │\n", "│ … ┆ … │\n", - "│ b ┆ [-0.5218, -0.33279] │\n", - "│ b ┆ [-0.5218, -0.33279] │\n", - "│ b ┆ [-0.5218, -0.33279] │\n", - "│ b ┆ [-0.5218, -0.33279] │\n", - "│ b ┆ [-0.5218, -0.33279] │\n", + "│ b ┆ [-0.504246, -0.324211] │\n", + "│ b ┆ [-0.504246, -0.324211] │\n", + "│ b ┆ [-0.504246, -0.324211] │\n", + "│ b ┆ [-0.504246, -0.324211] │\n", + "│ b ┆ [-0.504246, -0.324211] │\n", "└───────┴────────────────────────┘" ] }, @@ -462,7 +462,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 5)
x1x2ypredresid
f64f64f64f64f64
0.1811840.833164-0.087106-0.3728210.285715
0.7160970.72390.315413-0.6036740.919088
0.8035620.138619-0.436603-0.4493040.012702
0.9242350.082840.055943-0.4908440.546787
0.4608230.085475-1.355857-0.25969-1.096167
" + "shape: (5, 5)
x1x2ypredresid
f64f64f64f64f64
0.4996760.116938-0.771857-0.287837-0.48402
0.7918790.094017-0.414556-0.4257090.011154
0.5267410.544468-0.73191-0.443897-0.288013
0.1969690.264493-0.995451-0.186299-0.809152
0.7864990.964315-0.738782-0.713276-0.025506
" ], "text/plain": [ "shape: (5, 5)\n", @@ -471,11 +471,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪══════════╪═══════════╪═══════════╪═══════════╡\n", - "│ 0.181184 ┆ 0.833164 ┆ -0.087106 ┆ -0.372821 ┆ 0.285715 │\n", - "│ 0.716097 ┆ 0.7239 ┆ 0.315413 ┆ -0.603674 ┆ 0.919088 │\n", - "│ 0.803562 ┆ 0.138619 ┆ -0.436603 ┆ -0.449304 ┆ 0.012702 │\n", - "│ 0.924235 ┆ 0.08284 ┆ 0.055943 ┆ -0.490844 ┆ 0.546787 │\n", - "│ 0.460823 ┆ 0.085475 ┆ -1.355857 ┆ -0.25969 ┆ -1.096167 │\n", + "│ 0.499676 ┆ 0.116938 ┆ -0.771857 ┆ -0.287837 ┆ -0.48402 │\n", + "│ 0.791879 ┆ 0.094017 ┆ -0.414556 ┆ -0.425709 ┆ 0.011154 │\n", + "│ 0.526741 ┆ 0.544468 ┆ -0.73191 ┆ -0.443897 ┆ -0.288013 │\n", + "│ 0.196969 ┆ 0.264493 ┆ -0.995451 ┆ -0.186299 ┆ -0.809152 │\n", + "│ 0.786499 ┆ 0.964315 ┆ -0.738782 ┆ -0.713276 ┆ -0.025506 │\n", "└──────────┴──────────┴───────────┴───────────┴───────────┘" ] }, @@ -515,7 +515,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummycoeffs
strlist[f64]
"a"[-0.479674, -0.344547]
"b"[-0.5218, -0.33279]
" + "shape: (2, 2)
dummycoeffs
strlist[f64]
"a"[-0.491841, -0.342778]
"b"[-0.504246, -0.324211]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -524,8 +524,8 @@ "│ --- ┆ --- │\n", "│ str ┆ list[f64] │\n", "╞═══════╪════════════════════════╡\n", - "│ a ┆ [-0.479674, -0.344547] │\n", - "│ b ┆ [-0.5218, -0.33279] │\n", + "│ a ┆ [-0.491841, -0.342778] │\n", + "│ b ┆ [-0.504246, -0.324211] │\n", "└───────┴────────────────────────┘" ] }, @@ -560,7 +560,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummycoeffs
strlist[f64]
"a"[-0.299928, -0.187761]
"b"[-0.347887, -0.161111]
" + "shape: (2, 2)
dummycoeffs
strlist[f64]
"a"[-0.329471, -0.157246]
"b"[-0.338692, -0.144217]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -569,8 +569,8 @@ "│ --- ┆ --- │\n", "│ str ┆ list[f64] │\n", "╞═══════╪════════════════════════╡\n", - "│ a ┆ [-0.299928, -0.187761] │\n", - "│ b ┆ [-0.347887, -0.161111] │\n", + "│ a ┆ [-0.329471, -0.157246] │\n", + "│ b ┆ [-0.338692, -0.144217] │\n", "└───────┴────────────────────────┘" ] }, @@ -607,7 +607,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummylasso_r2
strf64
"a"-0.533955
"b"-0.547336
" + "shape: (2, 2)
dummylasso_r2
strf64
"a"-0.539511
"b"-0.52702
" ], "text/plain": [ "shape: (2, 2)\n", @@ -616,8 +616,8 @@ "│ --- ┆ --- │\n", "│ str ┆ f64 │\n", "╞═══════╪═══════════╡\n", - "│ a ┆ -0.533955 │\n", - "│ b ┆ -0.547336 │\n", + "│ a ┆ -0.539511 │\n", + "│ b ┆ -0.52702 │\n", "└───────┴───────────┘" ] }, @@ -658,7 +658,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (10_000, 5)
yx1x2coeffspred
f64f64f64list[f64]f64
-0.0871060.1811840.833164nullnull
0.3154130.7160970.7239nullnull
-0.4366030.8035620.138619nullnull
0.0559430.9242350.08284nullnull
-1.3558570.4608230.085475[-0.434778, 0.298689]-0.174825
-0.7858110.2978070.209961[-1.72997, -0.058576]-0.527496
-0.3917380.261320.040594[-1.856937, 0.204615]-0.476948
-0.4727050.8816790.572239[0.021692, -1.521368]-0.851461
-0.413730.9334420.189696[-0.181844, -1.202663]-0.397881
-0.0586460.1536290.836968[-0.610511, -0.035761]-0.123723
" + "shape: (10_000, 5)
yx1x2coeffspred
f64f64f64list[f64]f64
-0.7718570.4996760.116938nullnull
-0.4145560.7918790.094017nullnull
-0.731910.5267410.544468nullnull
-0.9954510.1969690.264493nullnull
-0.7387820.7864990.964315[-0.714089, -0.473885]-1.018605
0.1268560.1269460.964879[-1.072946, 0.029]-0.108224
-1.3136610.8992510.051631[-1.268071, 0.358157]-1.121822
-0.5508320.9114170.220975[-1.06123, 0.355866]-0.888586
-1.1935380.3333150.54282[-1.067394, 0.007257]-0.351839
-1.3640330.1383820.379413[-1.131432, -0.386638]-0.303265
" ], "text/plain": [ "shape: (10_000, 5)\n", @@ -667,17 +667,17 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ list[f64] ┆ f64 │\n", "╞═══════════╪══════════╪══════════╪════════════════════════╪═══════════╡\n", - "│ -0.087106 ┆ 0.181184 ┆ 0.833164 ┆ null ┆ null │\n", - "│ 0.315413 ┆ 0.716097 ┆ 0.7239 ┆ null ┆ null │\n", - "│ -0.436603 ┆ 0.803562 ┆ 0.138619 ┆ null ┆ null │\n", - "│ 0.055943 ┆ 0.924235 ┆ 0.08284 ┆ null ┆ null │\n", - "│ -1.355857 ┆ 0.460823 ┆ 0.085475 ┆ [-0.434778, 0.298689] ┆ -0.174825 │\n", + "│ -0.771857 ┆ 0.499676 ┆ 0.116938 ┆ null ┆ null │\n", + "│ -0.414556 ┆ 0.791879 ┆ 0.094017 ┆ null ┆ null │\n", + "│ -0.73191 ┆ 0.526741 ┆ 0.544468 ┆ null ┆ null │\n", + "│ -0.995451 ┆ 0.196969 ┆ 0.264493 ┆ null ┆ null │\n", + "│ -0.738782 ┆ 0.786499 ┆ 0.964315 ┆ [-0.714089, -0.473885] ┆ -1.018605 │\n", "│ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ -0.785811 ┆ 0.297807 ┆ 0.209961 ┆ [-1.72997, -0.058576] ┆ -0.527496 │\n", - "│ -0.391738 ┆ 0.26132 ┆ 0.040594 ┆ [-1.856937, 0.204615] ┆ -0.476948 │\n", - "│ -0.472705 ┆ 0.881679 ┆ 0.572239 ┆ [0.021692, -1.521368] ┆ -0.851461 │\n", - "│ -0.41373 ┆ 0.933442 ┆ 0.189696 ┆ [-0.181844, -1.202663] ┆ -0.397881 │\n", - "│ -0.058646 ┆ 0.153629 ┆ 0.836968 ┆ [-0.610511, -0.035761] ┆ -0.123723 │\n", + "│ 0.126856 ┆ 0.126946 ┆ 0.964879 ┆ [-1.072946, 0.029] ┆ -0.108224 │\n", + "│ -1.313661 ┆ 0.899251 ┆ 0.051631 ┆ [-1.268071, 0.358157] ┆ -1.121822 │\n", + "│ -0.550832 ┆ 0.911417 ┆ 0.220975 ┆ [-1.06123, 0.355866] ┆ -0.888586 │\n", + "│ -1.193538 ┆ 0.333315 ┆ 0.54282 ┆ [-1.067394, 0.007257] ┆ -0.351839 │\n", + "│ -1.364033 ┆ 0.138382 ┆ 0.379413 ┆ [-1.131432, -0.386638] ┆ -0.303265 │\n", "└───────────┴──────────┴──────────┴────────────────────────┴───────────┘" ] }, @@ -758,7 +758,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
a
list[f64]
[29.073839, 28.893157, 28.404245]
" + "shape: (1, 1)
a
list[f64]
[29.123673, 28.887074, 28.317706]
" ], "text/plain": [ "shape: (1, 1)\n", @@ -767,7 +767,7 @@ "│ --- │\n", "│ list[f64] │\n", "╞═════════════════════════════════╡\n", - "│ [29.073839, 28.893157, 28.4042… │\n", + "│ [29.123673, 28.887074, 28.3177… │\n", "└─────────────────────────────────┘" ] }, @@ -799,7 +799,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
singular_valueweight_vector
f64list[f64]
29.015447[0.568763, 0.822502]
28.458258[0.822502, -0.568763]
" + "shape: (2, 2)
singular_valueweight_vector
f64list[f64]
29.0503[0.835278, -0.549827]
28.458703[0.549827, 0.835278]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -808,8 +808,8 @@ "│ --- ┆ --- │\n", "│ f64 ┆ list[f64] │\n", "╞════════════════╪═══════════════════════╡\n", - "│ 29.015447 ┆ [0.568763, 0.822502] │\n", - "│ 28.458258 ┆ [0.822502, -0.568763] │\n", + "│ 29.0503 ┆ [0.835278, -0.549827] │\n", + "│ 28.458703 ┆ [0.549827, 0.835278] │\n", "└────────────────┴───────────────────────┘" ] }, @@ -841,7 +841,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
pc1
f64
0.380626
-0.147048
-0.395586
0.562945
0.307737
" + "shape: (5, 1)
pc1
f64
-0.210686
-0.011035
0.318099
-0.189717
0.351567
" ], "text/plain": [ "shape: (5, 1)\n", @@ -850,11 +850,11 @@ "│ --- │\n", "│ f64 │\n", "╞═══════════╡\n", - "│ 0.380626 │\n", - "│ -0.147048 │\n", - "│ -0.395586 │\n", - "│ 0.562945 │\n", - "│ 0.307737 │\n", + "│ -0.210686 │\n", + "│ -0.011035 │\n", + "│ 0.318099 │\n", + "│ -0.189717 │\n", + "│ 0.351567 │\n", "└───────────┘" ] }, @@ -894,7 +894,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"a"0.335440.9979450.4964790.5015810.4990170.4984350.493508
"b"0.3329561.0010330.5188980.5092740.5140410.5125840.500236
" + "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"b"0.3343221.0017520.4903120.5067430.4983920.4836510.496841
"a"0.330180.9946060.5080130.5009880.5044760.5139710.508563
" ], "text/plain": [ "shape: (2, 8)\n", @@ -904,8 +904,8 @@ "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ f64 │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ │\n", "╞══════════════╪══════════╪══════════╪═══════════╪══════════╪══════════╪════════════════╪══════════╡\n", - "│ a ┆ 0.33544 ┆ 0.997945 ┆ 0.496479 ┆ 0.501581 ┆ 0.499017 ┆ 0.498435 ┆ 0.493508 │\n", - "│ b ┆ 0.332956 ┆ 1.001033 ┆ 0.518898 ┆ 0.509274 ┆ 0.514041 ┆ 0.512584 ┆ 0.500236 │\n", + "│ b ┆ 0.334322 ┆ 1.001752 ┆ 0.490312 ┆ 0.506743 ┆ 0.498392 ┆ 0.483651 ┆ 0.496841 │\n", + "│ a ┆ 0.33018 ┆ 0.994606 ┆ 0.508013 ┆ 0.500988 ┆ 0.504476 ┆ 0.513971 ┆ 0.508563 │\n", "└──────────────┴──────────┴──────────┴───────────┴──────────┴──────────┴────────────────┴──────────┘" ] }, @@ -993,7 +993,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
sen
str
"world"
"church"
"hello"
"going"
"to"
" + "shape: (5, 1)
sen
str
"world"
"going"
"church"
"to"
"hello"
" ], "text/plain": [ "shape: (5, 1)\n", @@ -1003,10 +1003,10 @@ "│ str │\n", "╞════════╡\n", "│ world │\n", - "│ church │\n", - "│ hello │\n", "│ going │\n", + "│ church │\n", "│ to │\n", + "│ hello │\n", "└────────┘" ] }, @@ -1038,7 +1038,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
sen
str
"hello"
""
"world"
"go"
"church"
" + "shape: (5, 1)
sen
str
""
"hello"
"world"
"go"
"church"
" ], "text/plain": [ "shape: (5, 1)\n", @@ -1047,8 +1047,8 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", - "│ hello │\n", "│ │\n", + "│ hello │\n", "│ world │\n", "│ go │\n", "│ church │\n", @@ -1421,7 +1421,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
a
f64
null
null
-0.667205
-0.004369
-1.539039
" + "shape: (5, 1)
a
f64
null
null
0.620102
0.354738
-1.146126
" ], "text/plain": [ "shape: (5, 1)\n", @@ -1432,9 +1432,9 @@ "╞═══════════╡\n", "│ null │\n", "│ null │\n", - "│ -0.667205 │\n", - "│ -0.004369 │\n", - "│ -1.539039 │\n", + "│ 0.620102 │\n", + "│ 0.354738 │\n", + "│ -1.146126 │\n", "└───────────┘" ] }, @@ -1468,7 +1468,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
arandom_normalrandom_normal_that_respects_null_of_a
f64f64f64
null0.766622null
null0.626792null
-0.6672051.568425-1.028465
-0.004369-0.8467361.131894
-1.539039-0.9944061.053838
" + "shape: (5, 3)
arandom_normalrandom_normal_that_respects_null_of_a
f64f64f64
null1.48806null
null1.578787null
0.6201021.1978071.4971
0.3547381.3159511.310793
-1.1461260.020791-0.163486
" ], "text/plain": [ "shape: (5, 3)\n", @@ -1477,11 +1477,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪═══════════════╪═════════════════════════════════╡\n", - "│ null ┆ 0.766622 ┆ null │\n", - "│ null ┆ 0.626792 ┆ null │\n", - "│ -0.667205 ┆ 1.568425 ┆ -1.028465 │\n", - "│ -0.004369 ┆ -0.846736 ┆ 1.131894 │\n", - "│ -1.539039 ┆ -0.994406 ┆ 1.053838 │\n", + "│ null ┆ 1.48806 ┆ null │\n", + "│ null ┆ 1.578787 ┆ null │\n", + "│ 0.620102 ┆ 1.197807 ┆ 1.4971 │\n", + "│ 0.354738 ┆ 1.315951 ┆ 1.310793 │\n", + "│ -1.146126 ┆ 0.020791 ┆ -0.163486 │\n", "└───────────┴───────────────┴─────────────────────────────────┘" ] }, @@ -1516,7 +1516,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
arandom_strrandom_str_that_respects_null_of_a
f64strstr
null"FL"null
null"ftnwd"null
-0.667205"7YV""o1"
-0.004369"G7""Ys"
-1.539039"Jd4""3umWr"
" + "shape: (5, 3)
arandom_strrandom_str_that_respects_null_of_a
f64strstr
null"Tyl"null
null"9o"null
0.620102"j8""tIAdD"
0.354738"aqGdl""yqXA"
-1.146126"9y""BaOP"
" ], "text/plain": [ "shape: (5, 3)\n", @@ -1525,11 +1525,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ f64 ┆ str ┆ str │\n", "╞═══════════╪════════════╪═════════════════════════════════╡\n", - "│ null ┆ FL ┆ null │\n", - "│ null ┆ ftnwd ┆ null │\n", - "│ -0.667205 ┆ 7YV ┆ o1 │\n", - "│ -0.004369 ┆ G7 ┆ Ys │\n", - "│ -1.539039 ┆ Jd4 ┆ 3umWr │\n", + "│ null ┆ Tyl ┆ null │\n", + "│ null ┆ 9o ┆ null │\n", + "│ 0.620102 ┆ j8 ┆ tIAdD │\n", + "│ 0.354738 ┆ aqGdl ┆ yqXA │\n", + "│ -1.146126 ┆ 9y ┆ BaOP │\n", "└───────────┴────────────┴─────────────────────────────────┘" ] }, @@ -1564,7 +1564,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 2)
arandom_str
f64str
nullnull
nullnull
-0.667205"hIQx3"
-0.004369"OZsZn"
-1.539039"OXelh"
" + "shape: (5, 2)
arandom_str
f64str
nullnull
nullnull
0.620102"Grcv7"
0.354738"FnKt9"
-1.146126"CifSX"
" ], "text/plain": [ "shape: (5, 2)\n", @@ -1575,9 +1575,9 @@ "╞═══════════╪════════════╡\n", "│ null ┆ null │\n", "│ null ┆ null │\n", - "│ -0.667205 ┆ hIQx3 │\n", - "│ -0.004369 ┆ OZsZn │\n", - "│ -1.539039 ┆ OXelh │\n", + "│ 0.620102 ┆ Grcv7 │\n", + "│ 0.354738 ┆ FnKt9 │\n", + "│ -1.146126 ┆ CifSX │\n", "└───────────┴────────────┘" ] }, @@ -1611,7 +1611,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
atest1literaltest1_perturbed
f64f64f64f64
null-0.647906null-0.648101
null0.721174null0.721425
-0.6672050.6104711.4766930.610372
-0.004369-0.0545580.705794-0.054194
-1.5390390.2661830.0603740.266557
" + "shape: (5, 4)
atest1literaltest1_perturbed
f64f64f64f64
null-1.354802null-1.355085
null0.395681null0.39548
0.620102-1.4689481.766922-1.469441
0.354738-0.8347840.233956-0.835179
-1.1461260.2017550.7109310.201758
" ], "text/plain": [ "shape: (5, 4)\n", @@ -1620,11 +1620,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪═══════════╪══════════╪═════════════════╡\n", - "│ null ┆ -0.647906 ┆ null ┆ -0.648101 │\n", - "│ null ┆ 0.721174 ┆ null ┆ 0.721425 │\n", - "│ -0.667205 ┆ 0.610471 ┆ 1.476693 ┆ 0.610372 │\n", - "│ -0.004369 ┆ -0.054558 ┆ 0.705794 ┆ -0.054194 │\n", - "│ -1.539039 ┆ 0.266183 ┆ 0.060374 ┆ 0.266557 │\n", + "│ null ┆ -1.354802 ┆ null ┆ -1.355085 │\n", + "│ null ┆ 0.395681 ┆ null ┆ 0.39548 │\n", + "│ 0.620102 ┆ -1.468948 ┆ 1.766922 ┆ -1.469441 │\n", + "│ 0.354738 ┆ -0.834784 ┆ 0.233956 ┆ -0.835179 │\n", + "│ -1.146126 ┆ 0.201755 ┆ 0.710931 ┆ 0.201758 │\n", "└───────────┴───────────┴──────────┴─────────────────┘" ] }, @@ -1663,7 +1663,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
a[0, 1)NormalInt from [0, 10)
f64f64f64i32
null0.6850050.3662168
null0.24877-1.1890344
-0.6672050.890820.9462594
-0.0043690.9515030.6068959
-1.5390390.707259-0.9071757
" + "shape: (5, 4)
a[0, 1)NormalInt from [0, 10)
f64f64f64i32
null0.537011-0.5937832
null0.556977-1.175227
0.6201020.743510.9774379
0.3547380.722634-1.3241357
-1.1461260.192882-0.1445448
" ], "text/plain": [ "shape: (5, 4)\n", @@ -1672,11 +1672,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ i32 │\n", "╞═══════════╪══════════╪═══════════╪══════════════════╡\n", - "│ null ┆ 0.685005 ┆ 0.366216 ┆ 8 │\n", - "│ null ┆ 0.24877 ┆ -1.189034 ┆ 4 │\n", - "│ -0.667205 ┆ 0.89082 ┆ 0.946259 ┆ 4 │\n", - "│ -0.004369 ┆ 0.951503 ┆ 0.606895 ┆ 9 │\n", - "│ -1.539039 ┆ 0.707259 ┆ -0.907175 ┆ 7 │\n", + "│ null ┆ 0.537011 ┆ -0.593783 ┆ 2 │\n", + "│ null ┆ 0.556977 ┆ -1.17522 ┆ 7 │\n", + "│ 0.620102 ┆ 0.74351 ┆ 0.977437 ┆ 9 │\n", + "│ 0.354738 ┆ 0.722634 ┆ -1.324135 ┆ 7 │\n", + "│ -1.146126 ┆ 0.192882 ┆ -0.144544 ┆ 8 │\n", "└───────────┴──────────┴───────────┴──────────────────┘" ] }, @@ -1711,7 +1711,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
-0.9410260.3468441.4200340.491636
" + "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
0.8066910.4199731.5360180.463936
" ], "text/plain": [ "shape: (1, 4)\n", @@ -1720,7 +1720,7 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════════════╪════════════════════════════╪════════════════════════╡\n", - "│ -0.941026 ┆ 0.346844 ┆ 1.420034 ┆ 0.491636 │\n", + "│ 0.806691 ┆ 0.419973 ┆ 1.536018 ┆ 0.463936 │\n", "└─────────────────────┴─────────────────┴────────────────────────────┴────────────────────────┘" ] }, @@ -1764,7 +1764,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 5)
market_idvar1var2category_1category_2
i64f64f64i32i32
00.8429720.45036434
10.6256630.09508306
20.0292550.51552834
00.7825690.66447814
10.4871030.93536107
" + "shape: (5, 5)
market_idvar1var2category_1category_2
i64f64f64i32i32
00.7608530.16641728
10.1496070.88688621
20.3780870.70964821
00.0126710.78544533
10.4510610.889531
" ], "text/plain": [ "shape: (5, 5)\n", @@ -1773,11 +1773,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ i32 ┆ i32 │\n", "╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n", - "│ 0 ┆ 0.842972 ┆ 0.450364 ┆ 3 ┆ 4 │\n", - "│ 1 ┆ 0.625663 ┆ 0.095083 ┆ 0 ┆ 6 │\n", - "│ 2 ┆ 0.029255 ┆ 0.515528 ┆ 3 ┆ 4 │\n", - "│ 0 ┆ 0.782569 ┆ 0.664478 ┆ 1 ┆ 4 │\n", - "│ 1 ┆ 0.487103 ┆ 0.935361 ┆ 0 ┆ 7 │\n", + "│ 0 ┆ 0.760853 ┆ 0.166417 ┆ 2 ┆ 8 │\n", + "│ 1 ┆ 0.149607 ┆ 0.886886 ┆ 2 ┆ 1 │\n", + "│ 2 ┆ 0.378087 ┆ 0.709648 ┆ 2 ┆ 1 │\n", + "│ 0 ┆ 0.012671 ┆ 0.785445 ┆ 3 ┆ 3 │\n", + "│ 1 ┆ 0.451061 ┆ 0.8895 ┆ 3 ┆ 1 │\n", "└───────────┴──────────┴──────────┴────────────┴────────────┘" ] }, @@ -1817,17 +1817,17 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{1.550123,0.121144}{22.889401,0.955922}{1.746959,0.136718}
" + "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{-0.236185,0.813294}{44.257796,0.162397}{0.284087,0.888447}
" ], "text/plain": [ "shape: (1, 3)\n", - "┌─────────────────────┬──────────────────────┬─────────────────────┐\n", - "│ t-test ┆ chi2-test ┆ f-test │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ struct[2] ┆ struct[2] ┆ struct[2] │\n", - "╞═════════════════════╪══════════════════════╪═════════════════════╡\n", - "│ {1.550123,0.121144} ┆ {22.889401,0.955922} ┆ {1.746959,0.136718} │\n", - "└─────────────────────┴──────────────────────┴─────────────────────┘" + "┌──────────────────────┬──────────────────────┬─────────────────────┐\n", + "│ t-test ┆ chi2-test ┆ f-test │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ struct[2] ┆ struct[2] ┆ struct[2] │\n", + "╞══════════════════════╪══════════════════════╪═════════════════════╡\n", + "│ {-0.236185,0.813294} ┆ {44.257796,0.162397} ┆ {0.284087,0.888447} │\n", + "└──────────────────────┴──────────────────────┴─────────────────────┘" ] }, "execution_count": 40, @@ -1860,9 +1860,9 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n", "╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡\n", - "│ 0 ┆ {2.182824,0.029118} ┆ {38.555846,0.35473} ┆ {1.597169,0.172493} │\n", - "│ 1 ┆ {0.972503,0.330871} ┆ {26.35273,0.880507} ┆ {0.864623,0.484451} │\n", - "│ 2 ┆ {-0.471616,0.637232} ┆ {34.267769,0.551134} ┆ {0.783499,0.535832} │\n", + "│ 0 ┆ {-0.035991,0.971292} ┆ {49.261984,0.069371} ┆ {1.242167,0.290986} │\n", + "│ 1 ┆ {-1.297004,0.19472} ┆ {52.860716,0.034598} ┆ {0.558786,0.69263} │\n", + "│ 2 ┆ {0.910669,0.362536} ┆ {37.555952,0.397746} ┆ {1.407349,0.229106} │\n", "└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘\n" ] } @@ -1894,7 +1894,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 2)
first_digit_cntfirst_digit_distribution
u32f64
5560.1112
5660.1132
5560.1112
5250.105
5920.1184
5200.104
5660.1132
5520.1104
5670.1134
" + "shape: (9, 2)
first_digit_cntfirst_digit_distribution
u32f64
5690.1138
5850.117
5550.111
5330.1066
5450.109
5780.1156
5500.11
5300.106
5550.111
" ], "text/plain": [ "shape: (9, 2)\n", @@ -1903,15 +1903,15 @@ "│ --- ┆ --- │\n", "│ u32 ┆ f64 │\n", "╞═════════════════╪══════════════════════════╡\n", - "│ 556 ┆ 0.1112 │\n", - "│ 566 ┆ 0.1132 │\n", - "│ 556 ┆ 0.1112 │\n", - "│ 525 ┆ 0.105 │\n", - "│ 592 ┆ 0.1184 │\n", - "│ 520 ┆ 0.104 │\n", - "│ 566 ┆ 0.1132 │\n", - "│ 552 ┆ 0.1104 │\n", - "│ 567 ┆ 0.1134 │\n", + "│ 569 ┆ 0.1138 │\n", + "│ 585 ┆ 0.117 │\n", + "│ 555 ┆ 0.111 │\n", + "│ 533 ┆ 0.1066 │\n", + "│ 545 ┆ 0.109 │\n", + "│ 578 ┆ 0.1156 │\n", + "│ 550 ┆ 0.11 │\n", + "│ 530 ┆ 0.106 │\n", + "│ 555 ┆ 0.111 │\n", "└─────────────────┴──────────────────────────┘" ] }, @@ -1977,7 +1977,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
idvar1var2var3rrhnb_l_inf_cnt
u32f64f64f64f64f64u32
00.3477840.130260.3340190.6984917.90958615
10.482210.0509910.7361850.8920897.8234518
20.7866480.6397780.7747210.1342843.5151420
30.9447630.1294090.4603580.7158578.13377816
40.5976980.7476960.8853920.6708412.39268719
" + "shape: (5, 7)
idvar1var2var3rrhnb_l_inf_cnt
u32f64f64f64f64f64u32
00.7208980.9918270.3472260.4676222.15627510
10.1041650.5199440.1578920.8958485.86990716
20.8237540.3400880.0198650.3956992.732510
30.8757170.5778120.8591560.3616230.19327918
40.7668460.0747210.2492870.6667559.58475820
" ], "text/plain": [ "shape: (5, 7)\n", @@ -1986,11 +1986,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u32 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════════╡\n", - "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ 15 │\n", - "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ 8 │\n", - "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ 20 │\n", - "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ 16 │\n", - "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ 19 │\n", + "│ 0 ┆ 0.720898 ┆ 0.991827 ┆ 0.347226 ┆ 0.467622 ┆ 2.156275 ┆ 10 │\n", + "│ 1 ┆ 0.104165 ┆ 0.519944 ┆ 0.157892 ┆ 0.895848 ┆ 5.869907 ┆ 16 │\n", + "│ 2 ┆ 0.823754 ┆ 0.340088 ┆ 0.019865 ┆ 0.395699 ┆ 2.7325 ┆ 10 │\n", + "│ 3 ┆ 0.875717 ┆ 0.577812 ┆ 0.859156 ┆ 0.361623 ┆ 0.193279 ┆ 18 │\n", + "│ 4 ┆ 0.766846 ┆ 0.074721 ┆ 0.249287 ┆ 0.666755 ┆ 9.584758 ┆ 20 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────────┘" ] }, @@ -2027,7 +2027,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
idvar1var2var3rrhnb_l1_r_cnt
u32f64f64f64f64f64u32
00.3477840.130260.3340190.6984917.909586538
10.482210.0509910.7361850.8920897.823451783
20.7866480.6397780.7747210.1342843.5151410
30.9447630.1294090.4603580.7158578.133778389
40.5976980.7476960.8853920.6708412.392687483
" + "shape: (5, 7)
idvar1var2var3rrhnb_l1_r_cnt
u32f64f64f64f64f64u32
00.7208980.9918270.3472260.4676222.156275137
10.1041650.5199440.1578920.8958485.869907760
20.8237540.3400880.0198650.3956992.732588
30.8757170.5778120.8591560.3616230.193279110
40.7668460.0747210.2492870.6667559.584758363
" ], "text/plain": [ "shape: (5, 7)\n", @@ -2036,11 +2036,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u32 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪═════════════╡\n", - "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ 538 │\n", - "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ 783 │\n", - "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ 10 │\n", - "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ 389 │\n", - "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ 483 │\n", + "│ 0 ┆ 0.720898 ┆ 0.991827 ┆ 0.347226 ┆ 0.467622 ┆ 2.156275 ┆ 137 │\n", + "│ 1 ┆ 0.104165 ┆ 0.519944 ┆ 0.157892 ┆ 0.895848 ┆ 5.869907 ┆ 760 │\n", + "│ 2 ┆ 0.823754 ┆ 0.340088 ┆ 0.019865 ┆ 0.395699 ┆ 2.7325 ┆ 88 │\n", + "│ 3 ┆ 0.875717 ┆ 0.577812 ┆ 0.859156 ┆ 0.361623 ┆ 0.193279 ┆ 110 │\n", + "│ 4 ┆ 0.766846 ┆ 0.074721 ┆ 0.249287 ┆ 0.666755 ┆ 9.584758 ┆ 363 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴─────────────┘" ] }, @@ -2076,7 +2076,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
idvar1var2var3rrhbest friends
u32f64f64f64f64f64list[u32]
00.3477840.130260.3340190.6984917.909586[0, 502, … 115]
10.482210.0509910.7361850.8920897.823451[1, 1527, … 400]
20.7866480.6397780.7747210.1342843.51514[2, 1430, … 1451]
30.9447630.1294090.4603580.7158578.133778[3, 598, … 711]
40.5976980.7476960.8853920.6708412.392687[4, 650, … 213]
" + "shape: (5, 7)
idvar1var2var3rrhbest friends
u32f64f64f64f64f64list[u32]
00.7208980.9918270.3472260.4676222.156275[0, 466, … 1831]
10.1041650.5199440.1578920.8958485.869907[1, 1982, … 1117]
20.8237540.3400880.0198650.3956992.7325[2, 665, … 612]
30.8757170.5778120.8591560.3616230.193279[3, 377, … 1429]
40.7668460.0747210.2492870.6667559.584758[4, 1358, … 19]
" ], "text/plain": [ "shape: (5, 7)\n", @@ -2085,11 +2085,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[u32] │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════════════╡\n", - "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ [0, 502, … 115] │\n", - "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ [1, 1527, … 400] │\n", - "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ [2, 1430, … 1451] │\n", - "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ [3, 598, … 711] │\n", - "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ [4, 650, … 213] │\n", + "│ 0 ┆ 0.720898 ┆ 0.991827 ┆ 0.347226 ┆ 0.467622 ┆ 2.156275 ┆ [0, 466, … 1831] │\n", + "│ 1 ┆ 0.104165 ┆ 0.519944 ┆ 0.157892 ┆ 0.895848 ┆ 5.869907 ┆ [1, 1982, … 1117] │\n", + "│ 2 ┆ 0.823754 ┆ 0.340088 ┆ 0.019865 ┆ 0.395699 ┆ 2.7325 ┆ [2, 665, … 612] │\n", + "│ 3 ┆ 0.875717 ┆ 0.577812 ┆ 0.859156 ┆ 0.361623 ┆ 0.193279 ┆ [3, 377, … 1429] │\n", + "│ 4 ┆ 0.766846 ┆ 0.074721 ┆ 0.249287 ┆ 0.666755 ┆ 9.584758 ┆ [4, 1358, … 19] │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴───────────────────┘" ] }, @@ -2123,17 +2123,17 @@ "output_type": "stream", "text": [ "shape: (5, 3)\n", - "┌─────┬──────────────────┬────────────────────┐\n", - "│ id ┆ best friends ┆ best friends count │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ u32 ┆ list[u32] ┆ u32 │\n", - "╞═════╪══════════════════╪════════════════════╡\n", - "│ 0 ┆ [0, 502, … 875] ┆ 10 │\n", - "│ 1 ┆ [1, 1527, … 400] ┆ 3 │\n", - "│ 2 ┆ [2, 1430, … 549] ┆ 10 │\n", - "│ 3 ┆ [3, 598, … 1768] ┆ 9 │\n", - "│ 4 ┆ [4, 650, … 803] ┆ 6 │\n", - "└─────┴──────────────────┴────────────────────┘\n" + "┌─────┬───────────────────┬────────────────────┐\n", + "│ id ┆ best friends ┆ best friends count │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ list[u32] ┆ u32 │\n", + "╞═════╪═══════════════════╪════════════════════╡\n", + "│ 0 ┆ [0, 466, … 229] ┆ 5 │\n", + "│ 1 ┆ [1, 1982, … 579] ┆ 6 │\n", + "│ 2 ┆ [2, 665, … 1288] ┆ 7 │\n", + "│ 3 ┆ [3, 377, … 86] ┆ 9 │\n", + "│ 4 ┆ [4, 1358, … 1104] ┆ 15 │\n", + "└─────┴───────────────────┴────────────────────┘\n" ] } ], @@ -2173,7 +2173,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 8)
idvar1var2var3rrhidxdist
u32f64f64f64f64f64list[u32]list[f64]
00.3477840.130260.3340190.6984917.909586[0, 502, … 115][0.0, 0.066443, … 0.072798]
10.482210.0509910.7361850.8920897.823451[1, 1527, … 400][0.0, 0.049926, … 0.063975]
20.7866480.6397780.7747210.1342843.51514[2, 1430, … 1451][0.0, 0.02861, … 0.057878]
30.9447630.1294090.4603580.7158578.133778[3, 598, … 711][0.0, 0.032508, … 0.046937]
40.5976980.7476960.8853920.6708412.392687[4, 650, … 213][0.0, 0.068048, … 0.076969]
" + "shape: (5, 8)
idvar1var2var3rrhidxdist
u32f64f64f64f64f64list[u32]list[f64]
00.7208980.9918270.3472260.4676222.156275[0, 466, … 1831][0.0, 0.059742, … 0.087801]
10.1041650.5199440.1578920.8958485.869907[1, 1982, … 1117][0.0, 0.047833, … 0.070508]
20.8237540.3400880.0198650.3956992.7325[2, 665, … 612][0.0, 0.042865, … 0.061837]
30.8757170.5778120.8591560.3616230.193279[3, 377, … 1429][0.0, 0.070834, … 0.079681]
40.7668460.0747210.2492870.6667559.584758[4, 1358, … 19][0.0, 0.018153, … 0.04832]
" ], "text/plain": [ "shape: (5, 8)\n", @@ -2182,16 +2182,16 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[u32] ┆ list[f64] │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════════════╪══════════════════╡\n", - "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ [0, 502, … 115] ┆ [0.0, 0.066443, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.072798] │\n", - "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ [1, 1527, … 400] ┆ [0.0, 0.049926, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.063975] │\n", - "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ [2, 1430, … ┆ [0.0, 0.02861, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ 1451] ┆ 0.057878] │\n", - "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ [3, 598, … 711] ┆ [0.0, 0.032508, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.046937] │\n", - "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ [4, 650, … 213] ┆ [0.0, 0.068048, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.076969] │\n", + "│ 0 ┆ 0.720898 ┆ 0.991827 ┆ 0.347226 ┆ 0.467622 ┆ 2.156275 ┆ [0, 466, … 1831] ┆ [0.0, 0.059742, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.087801] │\n", + "│ 1 ┆ 0.104165 ┆ 0.519944 ┆ 0.157892 ┆ 0.895848 ┆ 5.869907 ┆ [1, 1982, … ┆ [0.0, 0.047833, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ 1117] ┆ … 0.070508] │\n", + "│ 2 ┆ 0.823754 ┆ 0.340088 ┆ 0.019865 ┆ 0.395699 ┆ 2.7325 ┆ [2, 665, … 612] ┆ [0.0, 0.042865, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.061837] │\n", + "│ 3 ┆ 0.875717 ┆ 0.577812 ┆ 0.859156 ┆ 0.361623 ┆ 0.193279 ┆ [3, 377, … 1429] ┆ [0.0, 0.070834, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.079681] │\n", + "│ 4 ┆ 0.766846 ┆ 0.074721 ┆ 0.249287 ┆ 0.666755 ┆ 9.584758 ┆ [4, 1358, … 19] ┆ [0.0, 0.018153, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.04832] │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────────────┴──────────────────┘" ] }, @@ -2231,7 +2231,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
00.3477840.130260.3340190.6984917.909586
20.7866480.6397780.7747210.1342843.51514
50.7126330.284850.3291330.5433386.065003
70.4057690.4433430.8922050.7317089.658069
100.8369910.4285170.4042040.4400194.264234
" + "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
80.4411370.194780.1835940.3982829.18641
120.7349510.6491020.5002230.8833179.904633
150.1080740.4690150.4798990.9184899.804659
160.4858010.5302770.5905740.7061866.602236
200.3602270.7137030.2314480.1382633.325791
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2240,11 +2240,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 │\n", - "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 │\n", - "│ 5 ┆ 0.712633 ┆ 0.28485 ┆ 0.329133 ┆ 0.543338 ┆ 6.065003 │\n", - "│ 7 ┆ 0.405769 ┆ 0.443343 ┆ 0.892205 ┆ 0.731708 ┆ 9.658069 │\n", - "│ 10 ┆ 0.836991 ┆ 0.428517 ┆ 0.404204 ┆ 0.440019 ┆ 4.264234 │\n", + "│ 8 ┆ 0.441137 ┆ 0.19478 ┆ 0.183594 ┆ 0.398282 ┆ 9.18641 │\n", + "│ 12 ┆ 0.734951 ┆ 0.649102 ┆ 0.500223 ┆ 0.883317 ┆ 9.904633 │\n", + "│ 15 ┆ 0.108074 ┆ 0.469015 ┆ 0.479899 ┆ 0.918489 ┆ 9.804659 │\n", + "│ 16 ┆ 0.485801 ┆ 0.530277 ┆ 0.590574 ┆ 0.706186 ┆ 6.602236 │\n", + "│ 20 ┆ 0.360227 ┆ 0.713703 ┆ 0.231448 ┆ 0.138263 ┆ 3.325791 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, @@ -2281,7 +2281,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
610.4575860.5075030.6142960.558337.064511
1640.5505850.5645620.9172420.7561738.670795
2570.5712680.544020.1108260.167148.900079
3340.5513340.4582450.403990.4943471.872597
3520.5353940.4594570.4672950.9867023.193573
" + "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
160.4858010.5302770.5905740.7061866.602236
1720.4860640.4346350.2665460.2995133.402964
1780.5017630.5223130.9349250.7096871.89558
1840.5431180.5296060.5458160.06283.587051
1890.4940740.5714090.4140450.3301026.852999
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2290,11 +2290,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 61 ┆ 0.457586 ┆ 0.507503 ┆ 0.614296 ┆ 0.55833 ┆ 7.064511 │\n", - "│ 164 ┆ 0.550585 ┆ 0.564562 ┆ 0.917242 ┆ 0.756173 ┆ 8.670795 │\n", - "│ 257 ┆ 0.571268 ┆ 0.54402 ┆ 0.110826 ┆ 0.16714 ┆ 8.900079 │\n", - "│ 334 ┆ 0.551334 ┆ 0.458245 ┆ 0.40399 ┆ 0.494347 ┆ 1.872597 │\n", - "│ 352 ┆ 0.535394 ┆ 0.459457 ┆ 0.467295 ┆ 0.986702 ┆ 3.193573 │\n", + "│ 16 ┆ 0.485801 ┆ 0.530277 ┆ 0.590574 ┆ 0.706186 ┆ 6.602236 │\n", + "│ 172 ┆ 0.486064 ┆ 0.434635 ┆ 0.266546 ┆ 0.299513 ┆ 3.402964 │\n", + "│ 178 ┆ 0.501763 ┆ 0.522313 ┆ 0.934925 ┆ 0.709687 ┆ 1.89558 │\n", + "│ 184 ┆ 0.543118 ┆ 0.529606 ┆ 0.545816 ┆ 0.0628 ┆ 3.587051 │\n", + "│ 189 ┆ 0.494074 ┆ 0.571409 ┆ 0.414045 ┆ 0.330102 ┆ 6.852999 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, @@ -2331,7 +2331,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
610.4575860.5075030.6142960.558337.064511
3540.4429610.5194260.6919720.944377.440443
4060.5204020.4435650.0155720.8146729.903239
4110.5230370.5695130.9751170.1634148.785701
4880.45470.4532890.3886350.3912559.463455
" + "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
160.4858010.5302770.5905740.7061866.602236
3030.5027670.516030.9780960.7381669.051765
4180.5086160.4762170.2211880.4215267.903868
4670.4896570.4831750.5235530.0938097.757587
4810.5166880.5505730.8661260.8485459.566443
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2340,11 +2340,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 61 ┆ 0.457586 ┆ 0.507503 ┆ 0.614296 ┆ 0.55833 ┆ 7.064511 │\n", - "│ 354 ┆ 0.442961 ┆ 0.519426 ┆ 0.691972 ┆ 0.94437 ┆ 7.440443 │\n", - "│ 406 ┆ 0.520402 ┆ 0.443565 ┆ 0.015572 ┆ 0.814672 ┆ 9.903239 │\n", - "│ 411 ┆ 0.523037 ┆ 0.569513 ┆ 0.975117 ┆ 0.163414 ┆ 8.785701 │\n", - "│ 488 ┆ 0.4547 ┆ 0.453289 ┆ 0.388635 ┆ 0.391255 ┆ 9.463455 │\n", + "│ 16 ┆ 0.485801 ┆ 0.530277 ┆ 0.590574 ┆ 0.706186 ┆ 6.602236 │\n", + "│ 303 ┆ 0.502767 ┆ 0.51603 ┆ 0.978096 ┆ 0.738166 ┆ 9.051765 │\n", + "│ 418 ┆ 0.508616 ┆ 0.476217 ┆ 0.221188 ┆ 0.421526 ┆ 7.903868 │\n", + "│ 467 ┆ 0.489657 ┆ 0.483175 ┆ 0.523553 ┆ 0.093809 ┆ 7.757587 │\n", + "│ 481 ┆ 0.516688 ┆ 0.550573 ┆ 0.866126 ┆ 0.848545 ┆ 9.566443 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, @@ -2381,7 +2381,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
idfriendscount
u64list[u32]u32
0[0, 459, … 1058]6
1[1]1
2[2, 1077]2
3[3, 104]2
4[4, 781, … 650]4
" + "shape: (5, 3)
idfriendscount
u64list[u32]u32
0[0, 1421, 1777]3
1[1, 17, … 842]4
2[2, 1857, … 665]4
3[3, 1868, 1468]3
4[4, 982, … 774]8
" ], "text/plain": [ "shape: (5, 3)\n", @@ -2390,11 +2390,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ u64 ┆ list[u32] ┆ u32 │\n", "╞═════╪══════════════════╪═══════╡\n", - "│ 0 ┆ [0, 459, … 1058] ┆ 6 │\n", - "│ 1 ┆ [1] ┆ 1 │\n", - "│ 2 ┆ [2, 1077] ┆ 2 │\n", - "│ 3 ┆ [3, 104] ┆ 2 │\n", - "│ 4 ┆ [4, 781, … 650] ┆ 4 │\n", + "│ 0 ┆ [0, 1421, 1777] ┆ 3 │\n", + "│ 1 ┆ [1, 17, … 842] ┆ 4 │\n", + "│ 2 ┆ [2, 1857, … 665] ┆ 4 │\n", + "│ 3 ┆ [3, 1868, 1468] ┆ 3 │\n", + "│ 4 ┆ [4, 982, … 774] ┆ 8 │\n", "└─────┴──────────────────┴───────┘" ] }, @@ -2439,7 +2439,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_28864/3354819425.py:3: UserWarning: The compatibility layer is considered experimental.\n", + "/tmp/ipykernel_23937/3354819425.py:3: UserWarning: The compatibility layer is considered experimental.\n", " from polars_ds.compat import compat as pds2\n" ] }, @@ -2453,7 +2453,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
actualpredicted0-20-9s1s2
f64f64i32i32strstr
1.00.96565326"I0""nR"
0.00.4403722"6""Mz"
1.00.93195512"1""kg"
0.00.55819714"R6""m"
1.00.23553521"bF""RO"
" + "shape: (5, 6)
actualpredicted0-20-9s1s2
f64f64i32i32strstr
0.00.19248222"Mk""DR"
1.00.01068128"en""9"
1.00.09600606"R""Q"
1.00.23889902"m""t"
0.00.45135518"x8""a"
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2462,11 +2462,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ i32 ┆ i32 ┆ str ┆ str │\n", "╞════════╪═══════════╪═════╪═════╪═════╪═════╡\n", - "│ 1.0 ┆ 0.965653 ┆ 2 ┆ 6 ┆ I0 ┆ nR │\n", - "│ 0.0 ┆ 0.44037 ┆ 2 ┆ 2 ┆ 6 ┆ Mz │\n", - "│ 1.0 ┆ 0.931955 ┆ 1 ┆ 2 ┆ 1 ┆ kg │\n", - "│ 0.0 ┆ 0.558197 ┆ 1 ┆ 4 ┆ R6 ┆ m │\n", - "│ 1.0 ┆ 0.235535 ┆ 2 ┆ 1 ┆ bF ┆ RO │\n", + "│ 0.0 ┆ 0.192482 ┆ 2 ┆ 2 ┆ Mk ┆ DR │\n", + "│ 1.0 ┆ 0.010681 ┆ 2 ┆ 8 ┆ en ┆ 9 │\n", + "│ 1.0 ┆ 0.096006 ┆ 0 ┆ 6 ┆ R ┆ Q │\n", + "│ 1.0 ┆ 0.238899 ┆ 0 ┆ 2 ┆ m ┆ t │\n", + "│ 0.0 ┆ 0.451355 ┆ 1 ┆ 8 ┆ x8 ┆ a │\n", "└────────┴───────────┴─────┴─────┴─────┴─────┘" ] }, @@ -2553,7 +2553,7 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n", - "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n", + "│ 0.497157 ┆ 0.496392 ┆ 0.496774 ┆ 0.498906 ┆ 0.496817 │\n", "└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n", "shape: (1, 5)\n", "┌───────────┬──────────┬──────────┬───────────────────┬──────────┐\n", @@ -2561,7 +2561,7 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n", - "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n", + "│ 0.497157 ┆ 0.496392 ┆ 0.496774 ┆ 0.498906 ┆ 0.496817 │\n", "└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n", "shape: (1, 5)\n", "┌───────────┬──────────┬──────────┬───────────────────┬──────────┐\n", @@ -2569,7 +2569,7 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n", - "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n", + "│ 0.497157 ┆ 0.496392 ┆ 0.496774 ┆ 0.498906 ┆ 0.496817 │\n", "└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n" ] } @@ -2599,7 +2599,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
<=baseline_pctactual_pctpsi_bin
f64f64f64f64
0.2113040.20.2150.001085
0.3997050.20.1950.000127
0.5946050.20.1870.000874
0.7973540.20.1890.000622
inf0.20.2140.000947
" + "shape: (5, 4)
<=baseline_pctactual_pctpsi_bin
f64f64f64f64
0.1983330.20.2160.001231
0.4123380.20.2180.001551
0.5972730.20.1740.003621
0.7846260.20.1840.001334
inf0.20.2080.000314
" ], "text/plain": [ "shape: (5, 4)\n", @@ -2608,11 +2608,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪══════════════╪════════════╪══════════╡\n", - "│ 0.211304 ┆ 0.2 ┆ 0.215 ┆ 0.001085 │\n", - "│ 0.399705 ┆ 0.2 ┆ 0.195 ┆ 0.000127 │\n", - "│ 0.594605 ┆ 0.2 ┆ 0.187 ┆ 0.000874 │\n", - "│ 0.797354 ┆ 0.2 ┆ 0.189 ┆ 0.000622 │\n", - "│ inf ┆ 0.2 ┆ 0.214 ┆ 0.000947 │\n", + "│ 0.198333 ┆ 0.2 ┆ 0.216 ┆ 0.001231 │\n", + "│ 0.412338 ┆ 0.2 ┆ 0.218 ┆ 0.001551 │\n", + "│ 0.597273 ┆ 0.2 ┆ 0.174 ┆ 0.003621 │\n", + "│ 0.784626 ┆ 0.2 ┆ 0.184 ┆ 0.001334 │\n", + "│ inf ┆ 0.2 ┆ 0.208 ┆ 0.000314 │\n", "└──────────┴──────────────┴────────────┴──────────┘" ] }, @@ -2647,13 +2647,13 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1,)
cid_ce
f64
13.128145
" + "shape: (1,)
cid_ce
f64
13.47064
" ], "text/plain": [ "shape: (1,)\n", "Series: 'cid_ce' [f64]\n", "[\n", - "\t13.128145\n", + "\t13.47064\n", "]" ] }, @@ -2684,13 +2684,13 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1,)
c3_stats
f64
0.11619
" + "shape: (1,)
c3_stats
f64
0.141005
" ], "text/plain": [ "shape: (1,)\n", "Series: 'c3_stats' [f64]\n", "[\n", - "\t0.11619\n", + "\t0.141005\n", "]" ] }, @@ -2722,7 +2722,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (10,)
str_leven
u32
2
2
2
2
2
2
2
1
2
2
" + "shape: (10,)
str_leven
u32
2
2
1
1
2
1
1
2
2
1
" ], "text/plain": [ "shape: (10,)\n", @@ -2730,14 +2730,14 @@ "[\n", "\t2\n", "\t2\n", + "\t1\n", + "\t1\n", "\t2\n", - "\t2\n", - "\t2\n", - "\t2\n", - "\t2\n", + "\t1\n", "\t1\n", "\t2\n", "\t2\n", + "\t1\n", "]" ] }, @@ -2775,7 +2775,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.13.1" } }, "nbformat": 4, diff --git a/examples/diagnosis.ipynb b/examples/diagnosis.ipynb deleted file mode 100644 index 44cb1b52..00000000 --- a/examples/diagnosis.ipynb +++ /dev/null @@ -1,2971 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Diagnosis and DIA (Data Inspection Assistant)\n", - "\n", - "If you cannot import this module, please try: pip install \"polars_ds[plot]\"\n", - "\n", - "The dataset used for dependency detection can be found on github, at examples/dependency.parquet\n", - "\n", - "The plots cannot be rendered on github. Currently, the plot backend is Altair but this is subject\n", - "to change depending on which plotting backend supports Polars more natively." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "import polars_ds as pds\n", - "from polars_ds.diagnosis import DIA" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 6)
uniform_1uniform_2expnormalfat_normallist_prob
f64f64f64f64f64list[f64]
5.4317510.790414.892053-0.289251-746.492553[0.79041, 0.20959]
8.6248510.4510471.047045-0.775272425.558072[0.451047, 0.548953]
7.4731750.0129273.3718660.454066-424.907189[0.012927, 0.987073]
7.251870.644817.200579-1.113124853.406457[0.64481, 0.35519]
10.8216980.1074212.2093820.4001031219.872708[0.107421, 0.892579]
" - ], - "text/plain": [ - "shape: (5, 6)\n", - "┌───────────┬───────────┬──────────┬───────────┬─────────────┬──────────────────────┐\n", - "│ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ list_prob │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[f64] │\n", - "╞═══════════╪═══════════╪══════════╪═══════════╪═════════════╪══════════════════════╡\n", - "│ 5.431751 ┆ 0.79041 ┆ 4.892053 ┆ -0.289251 ┆ -746.492553 ┆ [0.79041, 0.20959] │\n", - "│ 8.624851 ┆ 0.451047 ┆ 1.047045 ┆ -0.775272 ┆ 425.558072 ┆ [0.451047, 0.548953] │\n", - "│ 7.473175 ┆ 0.012927 ┆ 3.371866 ┆ 0.454066 ┆ -424.907189 ┆ [0.012927, 0.987073] │\n", - "│ 7.25187 ┆ 0.64481 ┆ 7.200579 ┆ -1.113124 ┆ 853.406457 ┆ [0.64481, 0.35519] │\n", - "│ 10.821698 ┆ 0.107421 ┆ 2.209382 ┆ 0.400103 ┆ 1219.872708 ┆ [0.107421, 0.892579] │\n", - "└───────────┴───────────┴──────────┴───────────┴─────────────┴──────────────────────┘" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pds.frame(size=1_000_000).select(\n", - " pds.random(0.0, 12.0).alias(\"uniform_1\"),\n", - " pds.random(0.0, 1.0).alias(\"uniform_2\"),\n", - " pds.random_exp(0.5).alias(\"exp\"),\n", - " pds.random_normal(0.0, 1.0).alias(\"normal\"),\n", - " pds.random_normal(0.0, 1000.0).alias(\"fat_normal\"),\n", - ").with_columns(\n", - " pl.concat_list(\"uniform_2\", 1 - pl.col(\"uniform_2\")).alias(\"list_prob\")\n", - ")\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "dia = DIA(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 7)
columnnull_countnull%NaN_countNaN%inf_countInf%
stru32f64u32f64u32f64
"uniform_1"00.000.000.0
"uniform_2"00.000.000.0
"exp"00.000.000.0
"normal"00.000.000.0
"fat_normal"00.000.000.0
" - ], - "text/plain": [ - "shape: (5, 7)\n", - "┌────────────┬────────────┬───────┬───────────┬──────┬───────────┬──────┐\n", - "│ column ┆ null_count ┆ null% ┆ NaN_count ┆ NaN% ┆ inf_count ┆ Inf% │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ u32 ┆ f64 ┆ u32 ┆ f64 ┆ u32 ┆ f64 │\n", - "╞════════════╪════════════╪═══════╪═══════════╪══════╪═══════════╪══════╡\n", - "│ uniform_1 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", - "│ uniform_2 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", - "│ exp ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", - "│ normal ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", - "│ fat_normal ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", - "└────────────┴────────────┴───────┴───────────┴──────┴───────────┴──────┘" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.special_values_report()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "
columnnon_null_cntnull%meanstdminq1medianq3maxIQRoutlier_cnthistogram
uniform_110000000.00%6.0013.4650.0003.0026.0009.00212.0006.0000
50.3K050.3K49.7K50.1K50.0K49.8K49.9K50.3K50.0K49.9K49.9K50.2K49.8K50.1K49.8K50.1K49.8K50.3K50.1K50.0K50.1K
uniform_210000000.00%0.5000.2890.0000.2510.5000.7501.0000.4990
50.4K050.2K50.2K50.0K49.4K49.4K50.4K50.1K50.2K49.9K49.9K50.1K49.9K49.9K50.0K50.1K50.1K49.9K49.9K50.3K50.1K
exp10000000.00%2.0012.0020.0000.5751.3902.77526.3102.19948090
482K0482K249K129K67.4K34.7K17.8K9.23K4.87K2.52K1.31K69735818589502512424
normal10000000.00%−0.0001.002−4.769−0.6750.0000.6754.5981.3497074
183K06393001.54K5.87K17.6K43.1K85.3K136K176K183K153K104K56.6K25.1K8.94K2.56K5931069
fat_normal10000000.00%−1.2351,000.559−4,761.854−676.352−1.710674.7475,460.6601,351.1006898
200K08785622.62K10.4K31.4K73.5K132K185K200K168K110K55.0K21.6K6.47K1.56K2584721
\n", - "\n", - "
\n", - " " - ], - "text/plain": [ - "GT(_tbl_data=shape: (5, 13)\n", - "┌────────────┬────────────┬───────┬───────────┬───┬────────────┬───────────┬───────────┬───────────┐\n", - "│ column ┆ non_null_c ┆ null% ┆ mean ┆ … ┆ max ┆ IQR ┆ outlier_c ┆ histogram │\n", - "│ --- ┆ nt ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ nt ┆ --- │\n", - "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ --- ┆ struct[1] │\n", - "│ ┆ u32 ┆ ┆ ┆ ┆ ┆ ┆ u32 ┆ │\n", - "╞════════════╪════════════╪═══════╪═══════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡\n", - "│ uniform_1 ┆ 1000000 ┆ 0.0 ┆ 6.001199 ┆ … ┆ 11.999983 ┆ 6.000085 ┆ 0 ┆ {[50302, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 49668, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 50075]} │\n", - "│ uniform_2 ┆ 1000000 ┆ 0.0 ┆ 0.500235 ┆ … ┆ 1.0 ┆ 0.499427 ┆ 0 ┆ {[50195, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 50158, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 50139]} │\n", - "│ exp ┆ 1000000 ┆ 0.0 ┆ 2.001267 ┆ … ┆ 26.309787 ┆ 2.199394 ┆ 48090 ┆ {[482093, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 249159, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 4]} │\n", - "│ normal ┆ 1000000 ┆ 0.0 ┆ -0.00035 ┆ … ┆ 4.597818 ┆ 1.349325 ┆ 7074 ┆ {[6, 39, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 9]} │\n", - "│ fat_normal ┆ 1000000 ┆ 0.0 ┆ -1.235498 ┆ … ┆ 5460.66002 ┆ 1351.0997 ┆ 6898 ┆ {[8, 78, │\n", - "│ ┆ ┆ ┆ ┆ ┆ 7 ┆ 66 ┆ ┆ … 1]} │\n", - "└────────────┴────────────┴───────┴───────────┴───┴────────────┴───────────┴───────────┴───────────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='non_null_cnt', type=, column_label='non_null_cnt', column_align='center', column_width=None), ColInfo(var='null%', type=, column_label='null%', column_align='right', column_width=None), ColInfo(var='mean', type=, column_label='mean', column_align='right', column_width=None), ColInfo(var='std', type=, column_label='std', column_align='right', column_width=None), ColInfo(var='min', type=, column_label='min', column_align='right', column_width=None), ColInfo(var='q1', type=, column_label='q1', column_align='right', column_width=None), ColInfo(var='median', type=, column_label='median', column_align='right', column_width=None), ColInfo(var='q3', type=, column_label='q3', column_align='right', column_width=None), ColInfo(var='max', type=, column_label='max', column_align='right', column_width=None), ColInfo(var='IQR', type=, column_label='IQR', column_align='right', column_width=None), ColInfo(var='outlier_cnt', type=, column_label='outlier_cnt', column_align='center', column_width=None), ColInfo(var='histogram', type=, column_label='histogram', column_align='center', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title=None, subtitle=None, preheader=None), _stubhead='column', _source_notes=[], _footnotes=[], _styles=[], _locale=, _formats=[, , ], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Only shows for numerical columns\n", - "dia.numeric_profile(histogram=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 12)
columnnon_null_cntnull%meanstdminq1medianq3maxIQRoutlier_cnt
stru32f64f64f64f64f64f64f64f64f64u32
"uniform_1"10000000.06.0011993.4649810.0000113.0023376.09.00242211.9999836.0000850
"uniform_2"10000000.00.5002350.2887830.0000030.2508750.50.7503021.00.4994270
"exp"10000000.02.0012672.0017980.0000040.5754371.392.77483126.3097872.19939448090
"normal"10000000.0-0.000351.001717-4.769409-0.674721-0.00.6746044.5978181.3493257074
"fat_normal"10000000.0-1.2354981000.558587-4761.854319-676.352382-1.71674.7473835460.6600271351.0997666898
" - ], - "text/plain": [ - "shape: (5, 12)\n", - "┌────────────┬────────────┬───────┬───────────┬───┬────────────┬───────────┬───────────┬───────────┐\n", - "│ column ┆ non_null_c ┆ null% ┆ mean ┆ … ┆ q3 ┆ max ┆ IQR ┆ outlier_c │\n", - "│ --- ┆ nt ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ nt │\n", - "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ --- │\n", - "│ ┆ u32 ┆ ┆ ┆ ┆ ┆ ┆ ┆ u32 │\n", - "╞════════════╪════════════╪═══════╪═══════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡\n", - "│ uniform_1 ┆ 1000000 ┆ 0.0 ┆ 6.001199 ┆ … ┆ 9.002422 ┆ 11.999983 ┆ 6.000085 ┆ 0 │\n", - "│ uniform_2 ┆ 1000000 ┆ 0.0 ┆ 0.500235 ┆ … ┆ 0.750302 ┆ 1.0 ┆ 0.499427 ┆ 0 │\n", - "│ exp ┆ 1000000 ┆ 0.0 ┆ 2.001267 ┆ … ┆ 2.774831 ┆ 26.309787 ┆ 2.199394 ┆ 48090 │\n", - "│ normal ┆ 1000000 ┆ 0.0 ┆ -0.00035 ┆ … ┆ 0.674604 ┆ 4.597818 ┆ 1.349325 ┆ 7074 │\n", - "│ fat_normal ┆ 1000000 ┆ 0.0 ┆ -1.235498 ┆ … ┆ 674.747383 ┆ 5460.6600 ┆ 1351.0997 ┆ 6898 │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ 27 ┆ 66 ┆ │\n", - "└────────────┴────────────┴───────┴───────────┴───┴────────────┴───────────┴───────────┴───────────┘" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Don't compute histogram. Use Polars as output format instead of GT\n", - "dia.numeric_profile(histogram=False, gt=False) " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (10, 3)\n", - "┌───────────┬────────────┬───────────┐\n", - "│ x ┆ y ┆ corr │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ f64 │\n", - "╞═══════════╪════════════╪═══════════╡\n", - "│ exp ┆ fat_normal ┆ 0.002338 │\n", - "│ exp ┆ normal ┆ -0.002172 │\n", - "│ uniform_1 ┆ exp ┆ -0.001704 │\n", - "│ uniform_1 ┆ uniform_2 ┆ 0.001595 │\n", - "│ normal ┆ fat_normal ┆ -0.001075 │\n", - "│ uniform_2 ┆ fat_normal ┆ 0.001054 │\n", - "│ uniform_1 ┆ fat_normal ┆ 0.000741 │\n", - "│ uniform_2 ┆ normal ┆ -0.000409 │\n", - "│ uniform_1 ┆ normal ┆ -0.000216 │\n", - "│ uniform_2 ┆ exp ┆ 0.000031 │\n", - "└───────────┴────────────┴───────────┘\n", - "shape: (10, 3)\n", - "┌───────────┬────────────┬───────────┐\n", - "│ x ┆ y ┆ corr │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ f64 │\n", - "╞═══════════╪════════════╪═══════════╡\n", - "│ exp ┆ normal ┆ -0.001175 │\n", - "│ exp ┆ fat_normal ┆ 0.001122 │\n", - "│ uniform_1 ┆ uniform_2 ┆ 0.001065 │\n", - "│ normal ┆ fat_normal ┆ -0.000882 │\n", - "│ uniform_2 ┆ fat_normal ┆ 0.000722 │\n", - "│ uniform_1 ┆ exp ┆ -0.000476 │\n", - "│ uniform_1 ┆ normal ┆ -0.00034 │\n", - "│ uniform_2 ┆ normal ┆ -0.000315 │\n", - "│ uniform_1 ┆ fat_normal ┆ 0.000288 │\n", - "│ uniform_2 ┆ exp ┆ -0.000059 │\n", - "└───────────┴────────────┴───────────┘\n", - "shape: (10, 3)\n", - "┌───────────┬────────────┬───────────┐\n", - "│ x ┆ y ┆ corr │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ f64 │\n", - "╞═══════════╪════════════╪═══════════╡\n", - "│ exp ┆ normal ┆ -0.001763 │\n", - "│ exp ┆ fat_normal ┆ 0.001683 │\n", - "│ uniform_1 ┆ uniform_2 ┆ 0.001597 │\n", - "│ normal ┆ fat_normal ┆ -0.001322 │\n", - "│ uniform_2 ┆ fat_normal ┆ 0.001084 │\n", - "│ uniform_1 ┆ exp ┆ -0.000714 │\n", - "│ uniform_1 ┆ normal ┆ -0.000509 │\n", - "│ uniform_2 ┆ normal ┆ -0.000472 │\n", - "│ uniform_1 ┆ fat_normal ┆ 0.000431 │\n", - "│ uniform_2 ┆ exp ┆ -0.000088 │\n", - "└───────────┴────────────┴───────────┘\n" - ] - } - ], - "source": [ - "print(dia.infer_corr())\n", - "print(dia.infer_corr(method = \"kendall\"))\n", - "print(dia.infer_corr(method = \"spearman\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'numerics': ['uniform_1', 'uniform_2', 'exp', 'normal', 'fat_normal'],\n", - " 'ints': [],\n", - " 'floats': ['uniform_1', 'uniform_2', 'exp', 'normal', 'fat_normal'],\n", - " 'strs': [],\n", - " 'bools': [],\n", - " 'cats': [],\n", - " 'list_floats': ['list_prob'],\n", - " 'list_bool': [],\n", - " 'list_str': [],\n", - " 'list_ints': [],\n", - " 'simple_types': ['uniform_1',\n", - " 'uniform_2',\n", - " 'exp',\n", - " 'normal',\n", - " 'fat_normal',\n", - " 'list_prob'],\n", - " 'other_types': []}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.meta()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['uniform_2', 'list_prob']" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Uniform_2 can potentially be a probability score column (e.g. output of predict_proba, but taking values only for class =1)\n", - "# list_prob can potentially be a 2-class probability column (e.g. output of predict_proba)\n", - "dia.infer_prob()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dependency Detection, Null Distributions, Distribution Comparisons\n", - "\n", - "Does knowing values in column A tell us values in column B?" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 22)
IDGenderDOBLead_Creation_DateCity_CodeCity_CategoryEmployer_CodeEmployer_Category1Employer_Category2Monthly_IncomeCustomer_Existing_Primary_Bank_CodePrimary_Bank_TypeContactedSourceSource_CategoryExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
strstrstrstrstrstrstrstri64f64strstrstrstrstrf64i64i64f64i64i64i64
"APPC90493171225""Female""23/07/79""15/07/16""C10001""A""COM0044082""A"42000.0"B001""P""N""S122""G"0.0nullnullnullnull00
"APPD40611263344""Male""07/12/86""04/07/16""C10003""A""COM0000002""C"13500.0"B002""P""Y""S122""G"0.020000213.25953100
"APPE70289249423""Male""10/12/82""19/07/16""C10125""C""COM0005267""C"42250.0"B003""G""Y""S143""B"0.0450004nullnull00
"APPF80273865537""Male""30/01/89""09/07/16""C10477""C""COM0004143""A"43500.0"B003""G""Y""S143""B"0.0920005nullnull70
"APPG60994436641""Male""19/04/85""20/07/16""C10002""A""COM0001781""A"410000.0"B001""P""Y""S134""B"2500.0500002nullnull100
" - ], - "text/plain": [ - "shape: (5, 22)\n", - "┌────────────────┬────────┬──────────┬────────────────┬───┬───────────────┬──────┬──────┬──────────┐\n", - "│ ID ┆ Gender ┆ DOB ┆ Lead_Creation_ ┆ … ┆ Interest_Rate ┆ EMI ┆ Var1 ┆ Approved │\n", - "│ --- ┆ --- ┆ --- ┆ Date ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ --- ┆ ┆ f64 ┆ i64 ┆ i64 ┆ i64 │\n", - "│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n", - "╞════════════════╪════════╪══════════╪════════════════╪═══╪═══════════════╪══════╪══════╪══════════╡\n", - "│ APPC9049317122 ┆ Female ┆ 23/07/79 ┆ 15/07/16 ┆ … ┆ null ┆ null ┆ 0 ┆ 0 │\n", - "│ 5 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ APPD4061126334 ┆ Male ┆ 07/12/86 ┆ 04/07/16 ┆ … ┆ 13.25 ┆ 953 ┆ 10 ┆ 0 │\n", - "│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ APPE7028924942 ┆ Male ┆ 10/12/82 ┆ 19/07/16 ┆ … ┆ null ┆ null ┆ 0 ┆ 0 │\n", - "│ 3 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ APPF8027386553 ┆ Male ┆ 30/01/89 ┆ 09/07/16 ┆ … ┆ null ┆ null ┆ 7 ┆ 0 │\n", - "│ 7 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ APPG6099443664 ┆ Male ┆ 19/04/85 ┆ 20/07/16 ┆ … ┆ null ┆ null ┆ 10 ┆ 0 │\n", - "│ 1 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└────────────────┴────────┴──────────┴────────────────┴───┴───────────────┴──────┴──────┴──────────┘" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pl.read_parquet(\"dependency.parquet\")\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(69713, 22)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "dia = DIA(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_33091/3037619369.py:1: UserWarning: The following columns are dropped because they cannot be used in dependency detection: ['Monthly_Income', 'Existing_EMI', 'Interest_Rate']\n", - " dia.infer_dependency()\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "shape: (171, 3)
columnbycond_entropy
strstrf64
"Gender""ID"0.0
"Contacted""ID"0.0
"Approved""ID"0.0
"Primary_Bank_Type""Customer_Existing_Primary_Bank…0.0
"Primary_Bank_Type""ID"0.0
"Loan_Amount""City_Code"2.702889
"City_Code""EMI"3.147327
"Lead_Creation_Date""EMI"3.92818
"Lead_Creation_Date""City_Code"4.204907
"Lead_Creation_Date""Loan_Amount"4.336805
" - ], - "text/plain": [ - "shape: (171, 3)\n", - "┌────────────────────┬─────────────────────────────────┬──────────────┐\n", - "│ column ┆ by ┆ cond_entropy │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ f64 │\n", - "╞════════════════════╪═════════════════════════════════╪══════════════╡\n", - "│ Gender ┆ ID ┆ 0.0 │\n", - "│ Contacted ┆ ID ┆ 0.0 │\n", - "│ Approved ┆ ID ┆ 0.0 │\n", - "│ Primary_Bank_Type ┆ Customer_Existing_Primary_Bank… ┆ 0.0 │\n", - "│ Primary_Bank_Type ┆ ID ┆ 0.0 │\n", - "│ … ┆ … ┆ … │\n", - "│ Loan_Amount ┆ City_Code ┆ 2.702889 │\n", - "│ City_Code ┆ EMI ┆ 3.147327 │\n", - "│ Lead_Creation_Date ┆ EMI ┆ 3.92818 │\n", - "│ Lead_Creation_Date ┆ City_Code ┆ 4.204907 │\n", - "│ Lead_Creation_Date ┆ Loan_Amount ┆ 4.336805 │\n", - "└────────────────────┴─────────────────────────────────┴──────────────┘" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.infer_dependency()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/tq/Projects/polars_ds_extension/python/polars_ds/diagnosis.py:796: UserWarning: The following columns are dropped because they cannot be used in dependency detection: ['Monthly_Income', 'Existing_EMI', 'Interest_Rate']\n", - " dep_frame = self.infer_dependency(subset=subset)\n" - ] - }, - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Dependency Plot\n", - "\n", - "\n", - "\n", - "Contacted\n", - "\n", - "Contacted\n", - "\n", - "\n", - "\n", - "ID\n", - "\n", - "ID\n", - "\n", - "\n", - "\n", - "ID->Contacted\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Loan_Amount\n", - "\n", - "Loan_Amount\n", - "\n", - "\n", - "\n", - "ID->Loan_Amount\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Var1\n", - "\n", - "Var1\n", - "\n", - "\n", - "\n", - "ID->Var1\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Employer_Code\n", - "\n", - "Employer_Code\n", - "\n", - "\n", - "\n", - "ID->Employer_Code\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Approved\n", - "\n", - "Approved\n", - "\n", - "\n", - "\n", - "ID->Approved\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "EMI\n", - "\n", - "EMI\n", - "\n", - "\n", - "\n", - "ID->EMI\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "City_Code\n", - "\n", - "City_Code\n", - "\n", - "\n", - "\n", - "ID->City_Code\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Loan_Period\n", - "\n", - "Loan_Period\n", - "\n", - "\n", - "\n", - "ID->Loan_Period\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Lead_Creation_Date\n", - "\n", - "Lead_Creation_Date\n", - "\n", - "\n", - "\n", - "ID->Lead_Creation_Date\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Source_Category\n", - "\n", - "Source_Category\n", - "\n", - "\n", - "\n", - "ID->Source_Category\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Customer_Existing_Primary_Bank_Code\n", - "\n", - "Customer_Existing_Primary_Bank_Code\n", - "\n", - "\n", - "\n", - "ID->Customer_Existing_Primary_Bank_Code\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "DOB\n", - "\n", - "DOB\n", - "\n", - "\n", - "\n", - "ID->DOB\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Gender\n", - "\n", - "Gender\n", - "\n", - "\n", - "\n", - "ID->Gender\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Source\n", - "\n", - "Source\n", - "\n", - "\n", - "\n", - "ID->Source\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Employer_Category2\n", - "\n", - "Employer_Category2\n", - "\n", - "\n", - "\n", - "Employer_Code->Employer_Category2\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Employer_Category1\n", - "\n", - "Employer_Category1\n", - "\n", - "\n", - "\n", - "Employer_Code->Employer_Category1\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "City_Category\n", - "\n", - "City_Category\n", - "\n", - "\n", - "\n", - "City_Code->City_Category\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Primary_Bank_Type\n", - "\n", - "Primary_Bank_Type\n", - "\n", - "\n", - "\n", - "Customer_Existing_Primary_Bank_Code->Primary_Bank_Type\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.plot_dependency()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/tq/Projects/polars_ds_extension/python/polars_ds/diagnosis.py:796: UserWarning: The following columns are dropped because they cannot be used in dependency detection: ['Monthly_Income', 'Existing_EMI', 'Interest_Rate']\n", - " dep_frame = self.infer_dependency(subset=subset)\n" - ] - }, - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Dependency Plot\n", - "\n", - "\n", - "\n", - "City_Category\n", - "\n", - "City_Category\n", - "\n", - "\n", - "\n", - "City_Code\n", - "\n", - "City_Code\n", - "\n", - "\n", - "\n", - "City_Code->City_Category\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Employer_Category1\n", - "\n", - "Employer_Category1\n", - "\n", - "\n", - "\n", - "Employer_Code\n", - "\n", - "Employer_Code\n", - "\n", - "\n", - "\n", - "Employer_Code->Employer_Category1\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Employer_Category2\n", - "\n", - "Employer_Category2\n", - "\n", - "\n", - "\n", - "Employer_Code->Employer_Category2\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Primary_Bank_Type\n", - "\n", - "Primary_Bank_Type\n", - "\n", - "\n", - "\n", - "Customer_Existing_Primary_Bank_Code\n", - "\n", - "Customer_Existing_Primary_Bank_Code\n", - "\n", - "\n", - "\n", - "Customer_Existing_Primary_Bank_Code->Primary_Bank_Type\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# ID implies everything, of course, because ID is unique.\n", - "# So let's not plot it\n", - "dia.plot_dependency(subset=pl.all().exclude(\"ID\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (13, 13)
columnnull_countn_uniquemost_freqmost_freq_cntmin_byte_lenmin_char_lenavg_byte_lenavg_char_lenmax_byte_lenmax_char_len5p_byte_len95p_byte_len
stru32u32stru32u32u32f64f64u32u32f64f64
"ID"069713"APPC90493171225"1151515.015.0151515.015.0
"Gender"02"Male"39949444.8539014.853901664.06.0
"DOB"1510760"11/01/82"253888.08.0888.08.0
"Lead_Creation_Date"092"02/09/16"1838888.08.0888.08.0
"City_Code"814679"C10001"10007666.06.0666.06.0
"Customer_Existing_Primary_Bank…939158"B001"14197444.04.0444.04.0
"Primary_Bank_Type"93913"P"39619111.01.0111.01.0
"Contacted"02"Y"45275111.01.0111.01.0
"Source"029"S122"30941444.04.0444.04.0
"Source_Category"07"B"29812111.01.0111.01.0
" - ], - "text/plain": [ - "shape: (13, 13)\n", - "┌───────────┬───────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", - "│ column ┆ null_coun ┆ n_unique ┆ most_freq ┆ … ┆ max_byte_ ┆ max_char_ ┆ 5p_byte_l ┆ 95p_byte_ │\n", - "│ --- ┆ t ┆ --- ┆ --- ┆ ┆ len ┆ len ┆ en ┆ len │\n", - "│ str ┆ --- ┆ u32 ┆ str ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ ┆ u32 ┆ ┆ ┆ ┆ u32 ┆ u32 ┆ f64 ┆ f64 │\n", - "╞═══════════╪═══════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", - "│ ID ┆ 0 ┆ 69713 ┆ APPC90493 ┆ … ┆ 15 ┆ 15 ┆ 15.0 ┆ 15.0 │\n", - "│ ┆ ┆ ┆ 171225 ┆ ┆ ┆ ┆ ┆ │\n", - "│ Gender ┆ 0 ┆ 2 ┆ Male ┆ … ┆ 6 ┆ 6 ┆ 4.0 ┆ 6.0 │\n", - "│ DOB ┆ 15 ┆ 10760 ┆ 11/01/82 ┆ … ┆ 8 ┆ 8 ┆ 8.0 ┆ 8.0 │\n", - "│ Lead_Crea ┆ 0 ┆ 92 ┆ 02/09/16 ┆ … ┆ 8 ┆ 8 ┆ 8.0 ┆ 8.0 │\n", - "│ tion_Date ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ City_Code ┆ 814 ┆ 679 ┆ C10001 ┆ … ┆ 6 ┆ 6 ┆ 6.0 ┆ 6.0 │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ Customer_ ┆ 9391 ┆ 58 ┆ B001 ┆ … ┆ 4 ┆ 4 ┆ 4.0 ┆ 4.0 │\n", - "│ Existing_ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ Primary_B ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ank… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ Primary_B ┆ 9391 ┆ 3 ┆ P ┆ … ┆ 1 ┆ 1 ┆ 1.0 ┆ 1.0 │\n", - "│ ank_Type ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ Contacted ┆ 0 ┆ 2 ┆ Y ┆ … ┆ 1 ┆ 1 ┆ 1.0 ┆ 1.0 │\n", - "│ Source ┆ 0 ┆ 29 ┆ S122 ┆ … ┆ 4 ┆ 4 ┆ 4.0 ┆ 4.0 │\n", - "│ Source_Ca ┆ 0 ┆ 7 ┆ B ┆ … ┆ 1 ┆ 1 ┆ 1.0 ┆ 1.0 │\n", - "│ tegory ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└───────────┴───────────┴──────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Basic stats about string columns\n", - "dia.str_stats()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Numerical EDA" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "
columnnon_null_cntnull%meanstdminq1medianq3maxIQRoutlier_cnthistogram
Employer_Category2654156.17%3.7200.8071.0004.0004.0004.0004.0000.0007833
57.6K04.30K4.26K1.96K1.62K57.6K
Monthly_Income697130.00%5,622.283174,767.0620.0001,650.0002,500.0004,000.00038,383,838.3002,350.0003920
69.7K069.7K53411
Existing_EMI696620.07%360.9292,288.5180.0000.0000.000350.000545,436.500350.0006549
69.7K069.7K511
Loan_Amount4200439.75%39,429.98330,727.5965,000.00020,000.00030,000.00050,000.000300,000.00030,000.0001547
27.7K027.7K8.65K15.0K6.42K5.98K1.61K1.03K1.72K5881556272659857294324
Loan_Period4200439.75%3.8911.1671.0003.0004.0005.0006.0002.0000
27.7K027.7K1.89K4.27K7.06K12.1K16.7K1
Interest_Rate2227668.05%19.2145.84711.99015.25018.00020.00037.0004.7502378
4.35K07582.75K4.35K2.44K1.44K2.44K3.97K15685865712184797321.43K290407240
EMI2227668.05%1,101.466752.661118.000649.000941.0001,295.00013,556.000646.0001081
47.4K047.4K8.18K9.89K2.41K9793702631064617121112
Var1697130.00%3.9483.8190.0000.0002.0007.00010.0007.0000
23.3K023.3K13.4K7.67K11.9K13.4K
Approved697130.00%0.0150.1200.0000.0000.0000.0001.0000.0001020
68.7K068.7K1.02K
\n", - "\n", - "
\n", - " " - ], - "text/plain": [ - "GT(_tbl_data=shape: (9, 13)\n", - "┌────────────┬────────────┬──────────┬────────────┬───┬──────────┬─────────┬───────────┬───────────┐\n", - "│ column ┆ non_null_c ┆ null% ┆ mean ┆ … ┆ max ┆ IQR ┆ outlier_c ┆ histogram │\n", - "│ --- ┆ nt ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ nt ┆ --- │\n", - "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ --- ┆ struct[1] │\n", - "│ ┆ u32 ┆ ┆ ┆ ┆ ┆ ┆ u32 ┆ │\n", - "╞════════════╪════════════╪══════════╪════════════╪═══╪══════════╪═════════╪═══════════╪═══════════╡\n", - "│ Employer_C ┆ 65415 ┆ 0.061653 ┆ 3.720187 ┆ … ┆ 4.0 ┆ 0.0 ┆ 7833 ┆ {[4298, │\n", - "│ ategory2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 4258, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 57582]} │\n", - "│ Monthly_In ┆ 69713 ┆ 0.0 ┆ 5622.2832 ┆ … ┆ 3.8384e7 ┆ 2350.0 ┆ 3920 ┆ {[69699, │\n", - "│ come ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5, … 1]} │\n", - "│ Existing_E ┆ 69662 ┆ 0.000732 ┆ 360.928751 ┆ … ┆ 545436.5 ┆ 350.0 ┆ 6549 ┆ {[69655, │\n", - "│ MI ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5, … 1]} │\n", - "│ Loan_Amoun ┆ 42004 ┆ 0.397472 ┆ 39429.9828 ┆ … ┆ 300000.0 ┆ 30000.0 ┆ 1547 ┆ {[27709, │\n", - "│ t ┆ ┆ ┆ 59 ┆ ┆ ┆ ┆ ┆ 8646, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 4]} │\n", - "│ Loan_Perio ┆ 42004 ┆ 0.397472 ┆ 3.890629 ┆ … ┆ 6.0 ┆ 2.0 ┆ 0 ┆ {[27709, │\n", - "│ d ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1886, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1]} │\n", - "│ Interest_R ┆ 22276 ┆ 0.680461 ┆ 19.21357 ┆ … ┆ 37.0 ┆ 4.75 ┆ 2378 ┆ {[758, │\n", - "│ ate ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2752, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 240]} │\n", - "│ EMI ┆ 22276 ┆ 0.680461 ┆ 1101.46624 ┆ … ┆ 13556.0 ┆ 646.0 ┆ 1081 ┆ {[47437, │\n", - "│ ┆ ┆ ┆ 2 ┆ ┆ ┆ ┆ ┆ 8179, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2]} │\n", - "│ Var1 ┆ 69713 ┆ 0.0 ┆ 3.948446 ┆ … ┆ 10.0 ┆ 7.0 ┆ 0 ┆ {[23308, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 13363, … │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 13420]} │\n", - "│ Approved ┆ 69713 ┆ 0.0 ┆ 0.014631 ┆ … ┆ 1.0 ┆ 0.0 ┆ 1020 ┆ {[68693, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1020]} │\n", - "└────────────┴────────────┴──────────┴────────────┴───┴──────────┴─────────┴───────────┴───────────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='non_null_cnt', type=, column_label='non_null_cnt', column_align='center', column_width=None), ColInfo(var='null%', type=, column_label='null%', column_align='right', column_width=None), ColInfo(var='mean', type=, column_label='mean', column_align='right', column_width=None), ColInfo(var='std', type=, column_label='std', column_align='right', column_width=None), ColInfo(var='min', type=, column_label='min', column_align='right', column_width=None), ColInfo(var='q1', type=, column_label='q1', column_align='right', column_width=None), ColInfo(var='median', type=, column_label='median', column_align='right', column_width=None), ColInfo(var='q3', type=, column_label='q3', column_align='right', column_width=None), ColInfo(var='max', type=, column_label='max', column_align='right', column_width=None), ColInfo(var='IQR', type=, column_label='IQR', column_align='right', column_width=None), ColInfo(var='outlier_cnt', type=, column_label='outlier_cnt', column_align='center', column_width=None), ColInfo(var='histogram', type=, column_label='histogram', column_align='center', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title=None, subtitle=None, preheader=None), _stubhead='column', _source_notes=[], _footnotes=[], _styles=[], _locale=, _formats=[, , ], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pl.read_parquet(\"dependency.parquet\")\n", - "df.head()\n", - "\n", - "\n", - "dia = DIA(df)\n", - "\n", - "dia.numeric_profile(iqr_multiplier=2)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 22)
IDGenderDOBLead_Creation_DateCity_CodeCity_CategoryEmployer_CodeEmployer_Category1Employer_Category2Monthly_IncomeCustomer_Existing_Primary_Bank_CodePrimary_Bank_TypeContactedSourceSource_CategoryExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
strstrstrstrstrstrstrstri64f64strstrstrstrstrf64i64i64f64i64i64i64
"APPC90493171225""Female""23/07/79""15/07/16""C10001""A""COM0044082""A"42000.0"B001""P""N""S122""G"0.0nullnullnullnull00
"APPD40611263344""Male""07/12/86""04/07/16""C10003""A""COM0000002""C"13500.0"B002""P""Y""S122""G"0.020000213.25953100
"APPE70289249423""Male""10/12/82""19/07/16""C10125""C""COM0005267""C"42250.0"B003""G""Y""S143""B"0.0450004nullnull00
"APPF80273865537""Male""30/01/89""09/07/16""C10477""C""COM0004143""A"43500.0"B003""G""Y""S143""B"0.0920005nullnull70
"APPG60994436641""Male""19/04/85""20/07/16""C10002""A""COM0001781""A"410000.0"B001""P""Y""S134""B"2500.0500002nullnull100
" - ], - "text/plain": [ - "shape: (5, 22)\n", - "┌────────────────┬────────┬──────────┬────────────────┬───┬───────────────┬──────┬──────┬──────────┐\n", - "│ ID ┆ Gender ┆ DOB ┆ Lead_Creation_ ┆ … ┆ Interest_Rate ┆ EMI ┆ Var1 ┆ Approved │\n", - "│ --- ┆ --- ┆ --- ┆ Date ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ --- ┆ ┆ f64 ┆ i64 ┆ i64 ┆ i64 │\n", - "│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n", - "╞════════════════╪════════╪══════════╪════════════════╪═══╪═══════════════╪══════╪══════╪══════════╡\n", - "│ APPC9049317122 ┆ Female ┆ 23/07/79 ┆ 15/07/16 ┆ … ┆ null ┆ null ┆ 0 ┆ 0 │\n", - "│ 5 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ APPD4061126334 ┆ Male ┆ 07/12/86 ┆ 04/07/16 ┆ … ┆ 13.25 ┆ 953 ┆ 10 ┆ 0 │\n", - "│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ APPE7028924942 ┆ Male ┆ 10/12/82 ┆ 19/07/16 ┆ … ┆ null ┆ null ┆ 0 ┆ 0 │\n", - "│ 3 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ APPF8027386553 ┆ Male ┆ 30/01/89 ┆ 09/07/16 ┆ … ┆ null ┆ null ┆ 7 ┆ 0 │\n", - "│ 7 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ APPG6099443664 ┆ Male ┆ 19/04/85 ┆ 20/07/16 ┆ … ┆ null ┆ null ┆ 10 ┆ 0 │\n", - "│ 1 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└────────────────┴────────┴──────────┴────────────────┴───┴───────────────┴──────┴──────┴──────────┘" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (2, 10)
columnEmployer_Category2Monthly_IncomeExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
strf64f64f64f64f64f64f64f64f64
"Monthly_Income"0.0015461.00.2458260.039998-0.003671-0.0147890.0351630.0248540.000472
"Existing_EMI"-0.0170740.2458261.00.008653-0.004603-0.0230010.0008130.006620.027821
" - ], - "text/plain": [ - "shape: (2, 10)\n", - "┌────────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬──────────┬──────────┐\n", - "│ column ┆ Employer_C ┆ Monthly_I ┆ Existing_ ┆ … ┆ Interest_ ┆ EMI ┆ Var1 ┆ Approved │\n", - "│ --- ┆ ategory2 ┆ ncome ┆ EMI ┆ ┆ Rate ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ f64 ┆ f64 │\n", - "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ ┆ ┆ │\n", - "╞════════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪══════════╪══════════╡\n", - "│ Monthly_In ┆ 0.001546 ┆ 1.0 ┆ 0.245826 ┆ … ┆ -0.014789 ┆ 0.035163 ┆ 0.024854 ┆ 0.000472 │\n", - "│ come ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ Existing_E ┆ -0.017074 ┆ 0.245826 ┆ 1.0 ┆ … ┆ -0.023001 ┆ 0.000813 ┆ 0.00662 ┆ 0.027821 │\n", - "│ MI ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└────────────┴────────────┴───────────┴───────────┴───┴───────────┴──────────┴──────────┴──────────┘" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import polars.selectors as cs\n", - "\n", - "dia.corr(subset=[\"Monthly_Income\", \"Existing_EMI\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (22, 10)
columnEmployer_Category2Monthly_IncomeExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
strf64f64f64f64f64f64f64f64f64
"ID"-0.0013440.002872-0.0018460.002515-0.004610.0039370.0001440.001832-0.002095
"Gender"-0.0419010.198756-0.143366-0.0123850.04055-0.0108010.0104270.53310.045283
"DOB"-0.0054170.005234-0.0025140.0048040.0061140.0004920.005204-0.000032-0.002547
"Lead_Creation_Date"0.0028680.0066150.009870.001086-0.0793010.014667-0.0147680.039963-0.005199
"City_Code"0.065158-0.0920070.039832-0.0247160.060910.1445210.0597810.030293-0.028195
"Loan_Period"-0.0156740.020818-0.0741730.4916371.0-0.0595340.145961-0.009532-0.000028
"Interest_Rate"0.242253-0.662215-0.040708-0.3561-0.0595341.0-0.284007-0.620129-0.12408
"EMI"-0.0686920.480323-0.2961460.8896930.145961-0.2840071.00.29340.040533
"Var1"-0.1143330.67349-0.0154270.296295-0.009532-0.6201290.29341.00.103802
"Approved"-0.020020.1201550.0566690.042231-0.000028-0.124080.0405330.1038021.0
" - ], - "text/plain": [ - "shape: (22, 10)\n", - "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", - "│ column ┆ Employer_ ┆ Monthly_I ┆ Existing_ ┆ … ┆ Interest_ ┆ EMI ┆ Var1 ┆ Approved │\n", - "│ --- ┆ Category2 ┆ ncome ┆ EMI ┆ ┆ Rate ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ f64 ┆ f64 │\n", - "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ ┆ ┆ │\n", - "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "│ ID ┆ -0.001344 ┆ 0.002872 ┆ -0.001846 ┆ … ┆ 0.003937 ┆ 0.000144 ┆ 0.001832 ┆ -0.00209 │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5 │\n", - "│ Gender ┆ -0.041901 ┆ 0.198756 ┆ -0.143366 ┆ … ┆ -0.010801 ┆ 0.010427 ┆ 0.5331 ┆ 0.045283 │\n", - "│ DOB ┆ -0.005417 ┆ 0.005234 ┆ -0.002514 ┆ … ┆ 0.000492 ┆ 0.005204 ┆ -0.000032 ┆ -0.00254 │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 7 │\n", - "│ Lead_Crea ┆ 0.002868 ┆ 0.006615 ┆ 0.00987 ┆ … ┆ 0.014667 ┆ -0.014768 ┆ 0.039963 ┆ -0.00519 │\n", - "│ tion_Date ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 9 │\n", - "│ City_Code ┆ 0.065158 ┆ -0.092007 ┆ 0.039832 ┆ … ┆ 0.144521 ┆ 0.059781 ┆ 0.030293 ┆ -0.02819 │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5 │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ Loan_Peri ┆ -0.015674 ┆ 0.020818 ┆ -0.074173 ┆ … ┆ -0.059534 ┆ 0.145961 ┆ -0.009532 ┆ -0.00002 │\n", - "│ od ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 8 │\n", - "│ Interest_ ┆ 0.242253 ┆ -0.662215 ┆ -0.040708 ┆ … ┆ 1.0 ┆ -0.284007 ┆ -0.620129 ┆ -0.12408 │\n", - "│ Rate ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ EMI ┆ -0.068692 ┆ 0.480323 ┆ -0.296146 ┆ … ┆ -0.284007 ┆ 1.0 ┆ 0.2934 ┆ 0.040533 │\n", - "│ Var1 ┆ -0.114333 ┆ 0.67349 ┆ -0.015427 ┆ … ┆ -0.620129 ┆ 0.2934 ┆ 1.0 ┆ 0.103802 │\n", - "│ Approved ┆ -0.02002 ┆ 0.120155 ┆ 0.056669 ┆ … ┆ -0.12408 ┆ 0.040533 ┆ 0.103802 ┆ 1.0 │\n", - "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.corr(subset=cs.all(), method=\"spearman\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "
columnEmployer_Category2Monthly_IncomeExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
Monthly_Income−0.1101.0000.1710.4810.021−0.6620.4800.6730.120
Existing_EMI0.0350.1711.000−0.225−0.074−0.041−0.296−0.0150.057
\n", - "\n", - "
\n", - " " - ], - "text/plain": [ - "GT(_tbl_data=shape: (2, 10)\n", - "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", - "│ column ┆ Employer_ ┆ Monthly_I ┆ Existing_ ┆ … ┆ Interest_ ┆ EMI ┆ Var1 ┆ Approved │\n", - "│ --- ┆ Category2 ┆ ncome ┆ EMI ┆ ┆ Rate ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ f64 ┆ f64 │\n", - "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ ┆ ┆ │\n", - "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "│ Monthly_I ┆ -0.109954 ┆ 1.0 ┆ 0.170825 ┆ … ┆ -0.662215 ┆ 0.480323 ┆ 0.67349 ┆ 0.120155 │\n", - "│ ncome ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ Existing_ ┆ 0.034867 ┆ 0.170825 ┆ 1.0 ┆ … ┆ -0.040708 ┆ -0.296146 ┆ -0.015427 ┆ 0.056669 │\n", - "│ EMI ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='Employer_Category2', type=, column_label='Employer_Category2', column_align='right', column_width=None), ColInfo(var='Monthly_Income', type=, column_label='Monthly_Income', column_align='right', column_width=None), ColInfo(var='Existing_EMI', type=, column_label='Existing_EMI', column_align='right', column_width=None), ColInfo(var='Loan_Amount', type=, column_label='Loan_Amount', column_align='right', column_width=None), ColInfo(var='Loan_Period', type=, column_label='Loan_Period', column_align='right', column_width=None), ColInfo(var='Interest_Rate', type=, column_label='Interest_Rate', column_align='right', column_width=None), ColInfo(var='EMI', type=, column_label='EMI', column_align='right', column_width=None), ColInfo(var='Var1', type=, column_label='Var1', column_align='right', column_width=None), ColInfo(var='Approved', type=, column_label='Approved', column_align='right', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title=None, subtitle=None, preheader=None), _stubhead=None, _source_notes=[], _footnotes=[], _styles=[StyleInfo(locname=LocBody(columns='Employer_Category2', rows=[0]), grpname=None, colname='Employer_Category2', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#550281')]), StyleInfo(locname=LocBody(columns='Employer_Category2', rows=[1]), grpname=None, colname='Employer_Category2', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#630278')]), StyleInfo(locname=LocBody(columns='Monthly_Income', rows=[0]), grpname=None, colname='Monthly_Income', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#bd0237')]), StyleInfo(locname=LocBody(columns='Monthly_Income', rows=[1]), grpname=None, colname='Monthly_Income', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6f026f')]), StyleInfo(locname=LocBody(columns='Existing_EMI', rows=[0]), grpname=None, colname='Existing_EMI', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6f026f')]), StyleInfo(locname=LocBody(columns='Existing_EMI', rows=[1]), grpname=None, colname='Existing_EMI', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#bd0237')]), StyleInfo(locname=LocBody(columns='Loan_Amount', rows=[0]), grpname=None, colname='Loan_Amount', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#8c025a')]), StyleInfo(locname=LocBody(columns='Loan_Amount', rows=[1]), grpname=None, colname='Loan_Amount', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#4a0289')]), StyleInfo(locname=LocBody(columns='Loan_Period', rows=[0]), grpname=None, colname='Loan_Period', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#610279')]), StyleInfo(locname=LocBody(columns='Loan_Period', rows=[1]), grpname=None, colname='Loan_Period', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#59027f')]), StyleInfo(locname=LocBody(columns='Interest_Rate', rows=[0]), grpname=None, colname='Interest_Rate', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#2202a6')]), StyleInfo(locname=LocBody(columns='Interest_Rate', rows=[1]), grpname=None, colname='Interest_Rate', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#5c027d')]), StyleInfo(locname=LocBody(columns='EMI', rows=[0]), grpname=None, colname='EMI', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#8c025a')]), StyleInfo(locname=LocBody(columns='EMI', rows=[1]), grpname=None, colname='EMI', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#44028e')]), StyleInfo(locname=LocBody(columns='Var1', rows=[0]), grpname=None, colname='Var1', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#9e024d')]), StyleInfo(locname=LocBody(columns='Var1', rows=[1]), grpname=None, colname='Var1', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#5e027b')]), StyleInfo(locname=LocBody(columns='Approved', rows=[0]), grpname=None, colname='Approved', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6b0272')]), StyleInfo(locname=LocBody(columns='Approved', rows=[1]), grpname=None, colname='Approved', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#650276')])], _locale=, _formats=[], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.plot_corr(subset=[\"Monthly_Income\", \"Existing_EMI\"], method=\"spearman\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "
columnEmployer_Category2Monthly_IncomeExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
Monthly_Income0.3781.0000.1100.7110.6550.8970.8730.4840.028
Existing_EMI0.3850.1201.0000.7070.7000.8680.8750.0460.012
\n", - "\n", - "
\n", - " " - ], - "text/plain": [ - "GT(_tbl_data=shape: (2, 10)\n", - "┌────────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬──────────┬──────────┐\n", - "│ column ┆ Employer_C ┆ Monthly_I ┆ Existing_ ┆ … ┆ Interest_ ┆ EMI ┆ Var1 ┆ Approved │\n", - "│ --- ┆ ategory2 ┆ ncome ┆ EMI ┆ ┆ Rate ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ f64 ┆ f64 │\n", - "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ ┆ ┆ │\n", - "╞════════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪══════════╪══════════╡\n", - "│ Monthly_In ┆ 0.377548 ┆ 0.999957 ┆ 0.109975 ┆ … ┆ 0.897349 ┆ 0.873183 ┆ 0.484005 ┆ 0.027934 │\n", - "│ come ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ Existing_E ┆ 0.385481 ┆ 0.119638 ┆ 0.999946 ┆ … ┆ 0.868431 ┆ 0.874944 ┆ 0.045564 ┆ 0.012015 │\n", - "│ MI ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└────────────┴────────────┴───────────┴───────────┴───┴───────────┴──────────┴──────────┴──────────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='Employer_Category2', type=, column_label='Employer_Category2', column_align='right', column_width=None), ColInfo(var='Monthly_Income', type=, column_label='Monthly_Income', column_align='right', column_width=None), ColInfo(var='Existing_EMI', type=, column_label='Existing_EMI', column_align='right', column_width=None), ColInfo(var='Loan_Amount', type=, column_label='Loan_Amount', column_align='right', column_width=None), ColInfo(var='Loan_Period', type=, column_label='Loan_Period', column_align='right', column_width=None), ColInfo(var='Interest_Rate', type=, column_label='Interest_Rate', column_align='right', column_width=None), ColInfo(var='EMI', type=, column_label='EMI', column_align='right', column_width=None), ColInfo(var='Var1', type=, column_label='Var1', column_align='right', column_width=None), ColInfo(var='Approved', type=, column_label='Approved', column_align='right', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title=None, subtitle=None, preheader=None), _stubhead=None, _source_notes=[], _footnotes=[], _styles=[StyleInfo(locname=LocBody(columns='Employer_Category2', rows=[0]), grpname=None, colname='Employer_Category2', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#830261')]), StyleInfo(locname=LocBody(columns='Employer_Category2', rows=[1]), grpname=None, colname='Employer_Category2', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#840260')]), StyleInfo(locname=LocBody(columns='Monthly_Income', rows=[0]), grpname=None, colname='Monthly_Income', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#bd0237')]), StyleInfo(locname=LocBody(columns='Monthly_Income', rows=[1]), grpname=None, colname='Monthly_Income', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6b0272')]), StyleInfo(locname=LocBody(columns='Existing_EMI', rows=[0]), grpname=None, colname='Existing_EMI', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6a0273')]), StyleInfo(locname=LocBody(columns='Existing_EMI', rows=[1]), grpname=None, colname='Existing_EMI', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#bd0237')]), StyleInfo(locname=LocBody(columns='Loan_Amount', rows=[0]), grpname=None, colname='Loan_Amount', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#a2024a')]), StyleInfo(locname=LocBody(columns='Loan_Amount', rows=[1]), grpname=None, colname='Loan_Amount', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#a2024b')]), StyleInfo(locname=LocBody(columns='Loan_Period', rows=[0]), grpname=None, colname='Loan_Period', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#9d024e')]), StyleInfo(locname=LocBody(columns='Loan_Period', rows=[1]), grpname=None, colname='Loan_Period', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#a1024b')]), StyleInfo(locname=LocBody(columns='Interest_Rate', rows=[0]), grpname=None, colname='Interest_Rate', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#b3023e')]), StyleInfo(locname=LocBody(columns='Interest_Rate', rows=[1]), grpname=None, colname='Interest_Rate', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#b10240')]), StyleInfo(locname=LocBody(columns='EMI', rows=[0]), grpname=None, colname='EMI', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#b1023f')]), StyleInfo(locname=LocBody(columns='EMI', rows=[1]), grpname=None, colname='EMI', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#b1023f')]), StyleInfo(locname=LocBody(columns='Var1', rows=[0]), grpname=None, colname='Var1', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#8d025a')]), StyleInfo(locname=LocBody(columns='Var1', rows=[1]), grpname=None, colname='Var1', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#640277')]), StyleInfo(locname=LocBody(columns='Approved', rows=[0]), grpname=None, colname='Approved', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#620278')]), StyleInfo(locname=LocBody(columns='Approved', rows=[1]), grpname=None, colname='Approved', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#610279')])], _locale=, _formats=[], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.plot_corr(subset=[\"Monthly_Income\", \"Existing_EMI\"], method=\"xi\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "
Null Distribution
columnpercentages in row groupsnull%total
Employer_Category2
0.3500.123.0E−306.8E−307.0E−305.0E−300.0220.0220.350.0210.0230.100.0240.110.046
6.17%69713
Monthly_Income
5-500000000000000
0.00%69713
Existing_EMI
2.4E−30000006.0E−406.0E−404.0E−406.0E−408.0E−402.4E−301.6E−301.2E−301.2E−308.5E−40
0.07%69713
Loan_Amount
0.9800.980.760.550.510.230.300.250.470.190.210.290.240.330.27
39.75%69713
Loan_Period
0.9800.980.760.550.510.230.300.250.470.190.210.290.240.330.27
39.75%69713
Interest_Rate
0.9900.990.850.730.720.550.610.600.710.580.590.660.610.690.65
68.05%69713
EMI
0.9900.990.850.730.720.550.610.600.710.580.590.660.610.690.65
68.05%69713
Var1
5-500000000000000
0.00%69713
Approved
5-500000000000000
0.00%69713
\n", - "\n", - "
\n", - " " - ], - "text/plain": [ - "GT(_tbl_data=shape: (9, 4)\n", - "┌────────────────────┬────────────────────────────────┬──────────┬───────┐\n", - "│ column ┆ percentages in row groups ┆ null% ┆ total │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ struct[1] ┆ f64 ┆ i32 │\n", - "╞════════════════════╪════════════════════════════════╪══════════╪═══════╡\n", - "│ Employer_Category2 ┆ {[0.121, 0.003, … 0.046467]} ┆ 0.061653 ┆ 69713 │\n", - "│ Monthly_Income ┆ {[0.0, 0.0, … 0.0]} ┆ 0.0 ┆ 69713 │\n", - "│ Existing_EMI ┆ {[0.0, 0.0, … 0.000849]} ┆ 0.000732 ┆ 69713 │\n", - "│ Loan_Amount ┆ {[0.976, 0.757, … 0.267346]} ┆ 0.397472 ┆ 69713 │\n", - "│ Loan_Period ┆ {[0.976, 0.757, … 0.267346]} ┆ 0.397472 ┆ 69713 │\n", - "│ Interest_Rate ┆ {[0.9892, 0.8522, … 0.645237]} ┆ 0.680461 ┆ 69713 │\n", - "│ EMI ┆ {[0.9892, 0.8522, … 0.645237]} ┆ 0.680461 ┆ 69713 │\n", - "│ Var1 ┆ {[0.0, 0.0, … 0.0]} ┆ 0.0 ┆ 69713 │\n", - "│ Approved ┆ {[0.0, 0.0, … 0.0]} ┆ 0.0 ┆ 69713 │\n", - "└────────────────────┴────────────────────────────────┴──────────┴───────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='percentages in row groups', type=, column_label='percentages in row groups', column_align='center', column_width=None), ColInfo(var='null%', type=, column_label='null%', column_align='right', column_width=None), ColInfo(var='total', type=, column_label='total', column_align='right', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title='Null Distribution', subtitle=None, preheader=None), _stubhead='column', _source_notes=[], _footnotes=[], _styles=[], _locale=, _formats=[, , ], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Checks whether nulls in one feature happens at the same time as nulls in other features, or if\n", - "# nulls happen only when some feature is small / high (by sorting the df.)\n", - "# Sort is optional.\n", - "# This first sorts the df in DIA by 'Monthly_Income', then groups every `row_group_size` rows together.\n", - "# E.g. first 10_000 rows become row gorup 1 (the first bin), etc, etc., and then computes the null rate\n", - "# in each row group.\n", - "\n", - "# In this example, we can see that lower monthly income population tend to have high loan amount null rate\n", - "# and that loan amount nulls correspond to loan period nulls (and much more.)\n", - "dia.plot_null_distribution(\n", - " cs.numeric(), \n", - " row_group_size=5_000,\n", - " sort = \"Monthly_Income\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "
Null Distribution
columnpercentages in row groupsnull%total
Employer_Category2
0.01108.0E−309.0E−304.0E−303.0E−300.0117.0E−300.0103.0E−307.0E−307.0E−303.0E−302.0E−3004.0E−306.0E−304.0E−304.0E−303.0E−306.0E−302.0E−304.0E−307.0E−303.0E−306.0E−308.0E−305.0E−304.0E−307.0E−303.0E−306.2E−30
0.52%29812
Monthly_Income
5-5000000000000000000000000000000
0.00%29812
Existing_EMI
1.0E−300000000000000000000000001.0E−30000000
0.00%29812
Loan_Amount
0.4400.390.400.400.400.410.390.400.420.380.420.400.410.420.400.420.410.440.430.410.400.420.380.320.340.330.340.390.440.400.43
39.73%29812
Loan_Period
0.4400.390.400.400.400.410.390.400.420.380.420.400.410.420.400.420.410.440.430.410.400.420.380.320.340.330.340.390.440.400.43
39.73%29812
Interest_Rate
0.7300.680.670.680.710.700.680.680.680.660.680.690.680.690.700.680.690.700.720.720.710.700.710.670.660.670.660.710.720.700.73
69.03%29812
EMI
0.7300.680.670.680.710.700.680.680.680.660.680.690.680.690.700.680.690.700.720.720.710.700.710.670.660.670.660.710.720.700.73
69.03%29812
Var1
5-5000000000000000000000000000000
0.00%29812
Approved
5-5000000000000000000000000000000
0.00%29812
\n", - "\n", - "
\n", - " " - ], - "text/plain": [ - "GT(_tbl_data=shape: (9, 4)\n", - "┌────────────────────┬──────────────────────────────┬──────────┬───────┐\n", - "│ column ┆ percentages in row groups ┆ null% ┆ total │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ struct[1] ┆ f64 ┆ i32 │\n", - "╞════════════════════╪══════════════════════════════╪══════════╪═══════╡\n", - "│ Employer_Category2 ┆ {[0.008, 0.009, … 0.006158]} ┆ 0.005199 ┆ 29812 │\n", - "│ Monthly_Income ┆ {[0.0, 0.0, … 0.0]} ┆ 0.0 ┆ 29812 │\n", - "│ Existing_EMI ┆ {[0.0, 0.0, … 0.0]} ┆ 0.000034 ┆ 29812 │\n", - "│ Loan_Amount ┆ {[0.389, 0.401, … 0.432266]} ┆ 0.397256 ┆ 29812 │\n", - "│ Loan_Period ┆ {[0.389, 0.401, … 0.432266]} ┆ 0.397256 ┆ 29812 │\n", - "│ Interest_Rate ┆ {[0.678, 0.669, … 0.731527]} ┆ 0.690326 ┆ 29812 │\n", - "│ EMI ┆ {[0.678, 0.669, … 0.731527]} ┆ 0.690326 ┆ 29812 │\n", - "│ Var1 ┆ {[0.0, 0.0, … 0.0]} ┆ 0.0 ┆ 29812 │\n", - "│ Approved ┆ {[0.0, 0.0, … 0.0]} ┆ 0.0 ┆ 29812 │\n", - "└────────────────────┴──────────────────────────────┴──────────┴───────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='percentages in row groups', type=, column_label='percentages in row groups', column_align='center', column_width=None), ColInfo(var='null%', type=, column_label='null%', column_align='right', column_width=None), ColInfo(var='total', type=, column_label='total', column_align='right', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title='Null Distribution', subtitle=None, preheader=None), _stubhead='column', _source_notes=[], _footnotes=[], _styles=[], _locale=, _formats=[, , ], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Additionally you can have a filter that is applied before row group assignments.\n", - "dia.plot_null_distribution(\n", - " cs.numeric(), \n", - " filter_by= (pl.col(\"Source_Category\") == 'B'),\n", - " row_group_size = 1000\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot a single feature's distribution, together with useful stats\n", - "df_bins, plot = dia.plot_dist(\n", - " \"EMI\", \n", - " n_bins=100, \n", - " density=False, \n", - ")\n", - "plot" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot is an Altair's chart object, so you can do a lot with it (but has some restriction)\n", - "# Here we can turn the plot into an interactive one\n", - "plot.interactive()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Again, you can provide a filter expression, which will be applied upfront\n", - "# You can see how because of this filter expression, null and extreme values are removed\n", - "df_bins, plot = dia.plot_dist(\n", - " \"EMI\", \n", - " n_bins=100, \n", - " density=False, \n", - " filter_by = pl.col(\"EMI\").is_between(pl.col(\"EMI\").quantile(0.01), pl.col(\"EMI\").quantile(0.99)),\n", - ")\n", - "plot" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.compare_dist_on_segment(\n", - " \"EMI\", \n", - " by = \"Primary_Bank_Type\",\n", - " # (pl.col(\"Loan_Amount\") > 10_000).alias(\"high_loan_amount\"), # The segment we want to use\n", - " n_bins=100, \n", - " density=True, \n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'5.5.0'" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import altair\n", - "\n", - "altair.__version__ " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Classic Iris Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 5)
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)species
f64f64f64f64str
5.13.51.40.2"setosa"
4.93.01.40.2"setosa"
4.73.21.30.2"setosa"
4.63.11.50.2"setosa"
5.03.61.40.2"setosa"
" - ], - "text/plain": [ - "shape: (5, 5)\n", - "┌───────────────────┬──────────────────┬───────────────────┬──────────────────┬─────────┐\n", - "│ sepal length (cm) ┆ sepal width (cm) ┆ petal length (cm) ┆ petal width (cm) ┆ species │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │\n", - "╞═══════════════════╪══════════════════╪═══════════════════╪══════════════════╪═════════╡\n", - "│ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", - "│ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", - "│ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa │\n", - "│ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa │\n", - "│ 5.0 ┆ 3.6 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", - "└───────────────────┴──────────────────┴───────────────────┴──────────────────┴─────────┘" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import polars as pl\n", - "import polars_ds as pds\n", - "from polars_ds.diagnosis import DIA\n", - "# Only used to get dataset.\n", - "from sklearn import datasets\n", - "\n", - "\n", - "dataset = datasets.load_iris()\n", - "df = pl.from_numpy(dataset.data, schema = dataset.feature_names).with_columns(\n", - " pl.Series(values=dataset.target).alias(\"species\")\n", - ").with_columns(\n", - " pl.when(pl.col(\"species\") == 0).then(pl.lit('setosa'))\n", - " .when(pl.col(\"species\") == 1).then(pl.lit('versicolor'))\n", - " .when(pl.col(\"species\") == 2).then(pl.lit('virginica')).alias(\"species\")\n", - ")\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia = DIA(df)\n", - "dia.plot_pca(\n", - " pl.all().exclude(\"species\"), \n", - " by = \"species\",\n", - " dim = 2\n", - ").interactive()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Just for fun, let's see how well can sepal length approximate petal length\n", - "\n", - "plot = dia.plot_lin_reg(\n", - " x = \"sepal length (cm)\", \n", - " target = \"petal length (cm)\", \n", - " add_bias=True,\n", - ")\n", - "plot\n" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# The plot is an Altair plot, you can do a lot of cool things from here. For more details, \n", - "# visit Altair's official docs!\n", - "plot.interactive()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dia.plot_lin_reg(\n", - " x = \"sepal length (cm)\", \n", - " target = \"petal length (cm)\", \n", - " add_bias=True,\n", - " by = \"species\",\n", - " # weights = \"petal width (cm)\" # Optional\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/eda.ipynb b/examples/eda.ipynb new file mode 100644 index 00000000..6fbfc66a --- /dev/null +++ b/examples/eda.ipynb @@ -0,0 +1,2346 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# EDA module\n", + "\n", + "## 1. Diagnosis and DIA (Data Inspection Assistant)\n", + "\n", + "If you cannot import this module, please try: pip install \"polars_ds[plot]\"\n", + "\n", + "The dataset used for dependency detection can be found on github, at examples/dependency.parquet\n", + "\n", + "The plots cannot be rendered on github. Currently, the plot backend is Altair but this is subject\n", + "to change depending on which plotting backend supports Polars more natively." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RendererRegistry.enable('svg')" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import altair as alt # the plotting package used by PDS\n", + "# For display on github. You don't need to do this when testing locally\n", + "alt.renderers.enable(\"svg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "import polars_ds as pds\n", + "from polars_ds.eda.diagnosis import DIA" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 6)
uniform_1uniform_2expnormalfat_normallist_prob
f64f64f64f64f64list[f64]
2.8957150.5718950.255772-0.847602-247.467743[0.571895, 0.428105]
8.4625750.6570872.0917230.767247511.340602[0.657087, 0.342913]
0.8522440.2431890.5831541.304583-203.913802[0.243189, 0.756811]
11.5839920.8027810.700429-1.183611600.817866[0.802781, 0.197219]
6.1025420.3103022.836993-0.9395621182.884094[0.310302, 0.689698]
" + ], + "text/plain": [ + "shape: (5, 6)\n", + "┌───────────┬───────────┬──────────┬───────────┬─────────────┬──────────────────────┐\n", + "│ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ list_prob │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[f64] │\n", + "╞═══════════╪═══════════╪══════════╪═══════════╪═════════════╪══════════════════════╡\n", + "│ 2.895715 ┆ 0.571895 ┆ 0.255772 ┆ -0.847602 ┆ -247.467743 ┆ [0.571895, 0.428105] │\n", + "│ 8.462575 ┆ 0.657087 ┆ 2.091723 ┆ 0.767247 ┆ 511.340602 ┆ [0.657087, 0.342913] │\n", + "│ 0.852244 ┆ 0.243189 ┆ 0.583154 ┆ 1.304583 ┆ -203.913802 ┆ [0.243189, 0.756811] │\n", + "│ 11.583992 ┆ 0.802781 ┆ 0.700429 ┆ -1.183611 ┆ 600.817866 ┆ [0.802781, 0.197219] │\n", + "│ 6.102542 ┆ 0.310302 ┆ 2.836993 ┆ -0.939562 ┆ 1182.884094 ┆ [0.310302, 0.689698] │\n", + "└───────────┴───────────┴──────────┴───────────┴─────────────┴──────────────────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pds.frame(size=1_000_000).select(\n", + " pds.random(0.0, 12.0).alias(\"uniform_1\"),\n", + " pds.random(0.0, 1.0).alias(\"uniform_2\"),\n", + " pds.random_exp(0.5).alias(\"exp\"),\n", + " pds.random_normal(0.0, 1.0).alias(\"normal\"),\n", + " pds.random_normal(0.0, 1000.0).alias(\"fat_normal\"),\n", + ").with_columns(\n", + " pl.concat_list(\"uniform_2\", 1 - pl.col(\"uniform_2\")).alias(\"list_prob\")\n", + ")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dia = DIA(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 7)
columnnull_countnull%NaN_countNaN%inf_countInf%
stru32f64u32f64u32f64
"uniform_1"00.000.000.0
"uniform_2"00.000.000.0
"exp"00.000.000.0
"normal"00.000.000.0
"fat_normal"00.000.000.0
" + ], + "text/plain": [ + "shape: (5, 7)\n", + "┌────────────┬────────────┬───────┬───────────┬──────┬───────────┬──────┐\n", + "│ column ┆ null_count ┆ null% ┆ NaN_count ┆ NaN% ┆ inf_count ┆ Inf% │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ u32 ┆ f64 ┆ u32 ┆ f64 ┆ u32 ┆ f64 │\n", + "╞════════════╪════════════╪═══════╪═══════════╪══════╪═══════════╪══════╡\n", + "│ uniform_1 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", + "│ uniform_2 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", + "│ exp ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", + "│ normal ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", + "│ fat_normal ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 ┆ 0 ┆ 0.0 │\n", + "└────────────┴────────────┴───────┴───────────┴──────┴───────────┴──────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dia.special_values_report()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
columnnon_null_cntnull%meanstdminq1medianq3maxIQRoutlier_cnthistogram
uniform_110000000.00%5.9983.4640.0002.9996.0008.99612.0005.9960
50.4K050.3K49.8K49.8K49.9K50.3K50.1K50.2K50.4K49.8K49.4K49.9K50.1K49.9K50.3K50.1K49.9K49.9K49.7K49.9K50.2K
uniform_210000000.00%0.5010.2890.0000.2510.5000.7511.0000.5000
50.6K049.8K49.7K49.7K50.2K50.2K49.6K50.1K49.8K49.8K50.0K50.1K49.7K50.0K50.6K49.9K50.3K50.4K49.9K50.0K50.3K
exp10000000.00%1.9991.9960.0000.5771.3902.76827.8692.19148068
502K0502K251K124K62.3K30.9K15.2K7.55K3.78K1.89K9614702261226638157211
normal10000000.00%0.0020.999−5.122−0.6730.0000.6745.1411.3477012
197K03191628684.17K15.2K42.0K91.5K153K197K196K150K89.2K41.3K14.6K4.04K836154232
fat_normal10000000.00%0.8931,000.159−4,780.726−674.0741.700677.1385,081.8371,351.2126841
194K010713882.01K7.74K23.9K58.0K110K164K194K180K133K76.7K34.7K12.3K3.39K834150174
\n", + "\n", + "
\n", + " " + ], + "text/plain": [ + "GT(_tbl_data=shape: (5, 13)\n", + "┌────────────┬────────────┬───────┬──────────┬───┬────────────┬────────────┬───────────┬───────────┐\n", + "│ column ┆ non_null_c ┆ null% ┆ mean ┆ … ┆ max ┆ IQR ┆ outlier_c ┆ histogram │\n", + "│ --- ┆ nt ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ nt ┆ --- │\n", + "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ --- ┆ struct[1] │\n", + "│ ┆ u32 ┆ ┆ ┆ ┆ ┆ ┆ u32 ┆ │\n", + "╞════════════╪════════════╪═══════╪══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡\n", + "│ uniform_1 ┆ 1000000 ┆ 0.0 ┆ 5.997996 ┆ … ┆ 11.999997 ┆ 5.996472 ┆ 0 ┆ {[50288, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 49770, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 50158]} │\n", + "│ uniform_2 ┆ 1000000 ┆ 0.0 ┆ 0.500747 ┆ … ┆ 1.0 ┆ 0.500303 ┆ 0 ┆ {[49763, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 49659, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 50286]} │\n", + "│ exp ┆ 1000000 ┆ 0.0 ┆ 1.998922 ┆ … ┆ 27.868522 ┆ 2.191431 ┆ 48068 ┆ {[501500, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 250841, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1]} │\n", + "│ normal ┆ 1000000 ┆ 0.0 ┆ 0.001572 ┆ … ┆ 5.141155 ┆ 1.346717 ┆ 7012 ┆ {[3, 19, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 2]} │\n", + "│ fat_normal ┆ 1000000 ┆ 0.0 ┆ 0.893202 ┆ … ┆ 5081.83727 ┆ 1351.21172 ┆ 6841 ┆ {[10, 71, │\n", + "│ ┆ ┆ ┆ ┆ ┆ 6 ┆ 8 ┆ ┆ … 4]} │\n", + "└────────────┴────────────┴───────┴──────────┴───┴────────────┴────────────┴───────────┴───────────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='non_null_cnt', type=, column_label='non_null_cnt', column_align='center', column_width=None), ColInfo(var='null%', type=, column_label='null%', column_align='right', column_width=None), ColInfo(var='mean', type=, column_label='mean', column_align='right', column_width=None), ColInfo(var='std', type=, column_label='std', column_align='right', column_width=None), ColInfo(var='min', type=, column_label='min', column_align='right', column_width=None), ColInfo(var='q1', type=, column_label='q1', column_align='right', column_width=None), ColInfo(var='median', type=, column_label='median', column_align='right', column_width=None), ColInfo(var='q3', type=, column_label='q3', column_align='right', column_width=None), ColInfo(var='max', type=, column_label='max', column_align='right', column_width=None), ColInfo(var='IQR', type=, column_label='IQR', column_align='right', column_width=None), ColInfo(var='outlier_cnt', type=, column_label='outlier_cnt', column_align='center', column_width=None), ColInfo(var='histogram', type=, column_label='histogram', column_align='center', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title=None, subtitle=None, preheader=None), _stubhead='column', _source_notes=[], _footnotes=[], _styles=[], _locale=, _formats=[, , ], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Only shows for numerical columns\n", + "dia.numeric_profile(histogram=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 12)
columnnon_null_cntnull%meanstdminq1medianq3maxIQRoutlier_cnt
stru32f64f64f64f64f64f64f64f64f64u32
"uniform_1"10000000.05.9979963.4638450.0000032.9991556.08.99562711.9999975.9964720
"uniform_2"10000000.00.5007470.288590.0000020.2505430.50.7508461.00.5003030
"exp"10000000.01.9989221.9963062.9467e-70.5769951.392.76842627.8685222.19143148068
"normal"10000000.00.0015720.999155-5.122434-0.672590.00.6741275.1411551.3467177012
"fat_normal"10000000.00.8932021000.158613-4780.725688-674.0738041.7677.1379245081.8372761351.2117286841
" + ], + "text/plain": [ + "shape: (5, 12)\n", + "┌────────────┬────────────┬───────┬──────────┬───┬────────────┬────────────┬───────────┬───────────┐\n", + "│ column ┆ non_null_c ┆ null% ┆ mean ┆ … ┆ q3 ┆ max ┆ IQR ┆ outlier_c │\n", + "│ --- ┆ nt ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ nt │\n", + "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ --- │\n", + "│ ┆ u32 ┆ ┆ ┆ ┆ ┆ ┆ ┆ u32 │\n", + "╞════════════╪════════════╪═══════╪══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡\n", + "│ uniform_1 ┆ 1000000 ┆ 0.0 ┆ 5.997996 ┆ … ┆ 8.995627 ┆ 11.999997 ┆ 5.996472 ┆ 0 │\n", + "│ uniform_2 ┆ 1000000 ┆ 0.0 ┆ 0.500747 ┆ … ┆ 0.750846 ┆ 1.0 ┆ 0.500303 ┆ 0 │\n", + "│ exp ┆ 1000000 ┆ 0.0 ┆ 1.998922 ┆ … ┆ 2.768426 ┆ 27.868522 ┆ 2.191431 ┆ 48068 │\n", + "│ normal ┆ 1000000 ┆ 0.0 ┆ 0.001572 ┆ … ┆ 0.674127 ┆ 5.141155 ┆ 1.346717 ┆ 7012 │\n", + "│ fat_normal ┆ 1000000 ┆ 0.0 ┆ 0.893202 ┆ … ┆ 677.137924 ┆ 5081.83727 ┆ 1351.2117 ┆ 6841 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ 6 ┆ 28 ┆ │\n", + "└────────────┴────────────┴───────┴──────────┴───┴────────────┴────────────┴───────────┴───────────┘" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Don't compute histogram. Use Polars as output format instead of GT\n", + "dia.numeric_profile(histogram=False, gt=False) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (10, 3)\n", + "┌───────────┬────────────┬───────────┐\n", + "│ x ┆ y ┆ corr │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 │\n", + "╞═══════════╪════════════╪═══════════╡\n", + "│ uniform_2 ┆ exp ┆ 0.00202 │\n", + "│ exp ┆ normal ┆ -0.001338 │\n", + "│ uniform_1 ┆ fat_normal ┆ -0.001127 │\n", + "│ uniform_1 ┆ uniform_2 ┆ 0.001001 │\n", + "│ exp ┆ fat_normal ┆ 0.000832 │\n", + "│ uniform_2 ┆ fat_normal ┆ 0.000664 │\n", + "│ normal ┆ fat_normal ┆ -0.000436 │\n", + "│ uniform_1 ┆ normal ┆ -0.000333 │\n", + "│ uniform_2 ┆ normal ┆ -0.000265 │\n", + "│ uniform_1 ┆ exp ┆ 0.000132 │\n", + "└───────────┴────────────┴───────────┘\n", + "shape: (10, 3)\n", + "┌───────────┬────────────┬───────────┐\n", + "│ x ┆ y ┆ corr │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 │\n", + "╞═══════════╪════════════╪═══════════╡\n", + "│ uniform_2 ┆ exp ┆ 0.000924 │\n", + "│ exp ┆ normal ┆ -0.000699 │\n", + "│ uniform_1 ┆ uniform_2 ┆ 0.000667 │\n", + "│ uniform_2 ┆ fat_normal ┆ 0.000631 │\n", + "│ uniform_1 ┆ fat_normal ┆ -0.000621 │\n", + "│ exp ┆ fat_normal ┆ 0.000448 │\n", + "│ uniform_1 ┆ normal ┆ -0.000287 │\n", + "│ normal ┆ fat_normal ┆ -0.000231 │\n", + "│ uniform_2 ┆ normal ┆ -0.000225 │\n", + "│ uniform_1 ┆ exp ┆ -0.000156 │\n", + "└───────────┴────────────┴───────────┘\n", + "shape: (10, 3)\n", + "┌───────────┬────────────┬───────────┐\n", + "│ x ┆ y ┆ corr │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 │\n", + "╞═══════════╪════════════╪═══════════╡\n", + "│ uniform_2 ┆ exp ┆ 0.001386 │\n", + "│ exp ┆ normal ┆ -0.001048 │\n", + "│ uniform_1 ┆ uniform_2 ┆ 0.001001 │\n", + "│ uniform_2 ┆ fat_normal ┆ 0.000945 │\n", + "│ uniform_1 ┆ fat_normal ┆ -0.000931 │\n", + "│ exp ┆ fat_normal ┆ 0.000673 │\n", + "│ uniform_1 ┆ normal ┆ -0.000431 │\n", + "│ normal ┆ fat_normal ┆ -0.000346 │\n", + "│ uniform_2 ┆ normal ┆ -0.000339 │\n", + "│ uniform_1 ┆ exp ┆ -0.000235 │\n", + "└───────────┴────────────┴───────────┘\n" + ] + } + ], + "source": [ + "print(dia.infer_corr())\n", + "print(dia.infer_corr(method = \"kendall\"))\n", + "print(dia.infer_corr(method = \"spearman\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'numerics': ['uniform_1', 'uniform_2', 'exp', 'normal', 'fat_normal'],\n", + " 'ints': [],\n", + " 'floats': ['uniform_1', 'uniform_2', 'exp', 'normal', 'fat_normal'],\n", + " 'strs': [],\n", + " 'bools': [],\n", + " 'cats': [],\n", + " 'list_floats': ['list_prob'],\n", + " 'list_bool': [],\n", + " 'list_str': [],\n", + " 'list_ints': [],\n", + " 'simple_types': ['uniform_1',\n", + " 'uniform_2',\n", + " 'exp',\n", + " 'normal',\n", + " 'fat_normal',\n", + " 'list_prob'],\n", + " 'other_types': []}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dia.meta()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['uniform_2', 'list_prob']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Uniform_2 can potentially be a probability score column (e.g. output of predict_proba, but taking values only for class =1)\n", + "# list_prob can potentially be a 2-class probability column (e.g. output of predict_proba)\n", + "dia.infer_prob()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dependency Detection, Distribution Comparisons\n", + "\n", + "Does knowing values in column A tell us values in column B?" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 22)
IDGenderDOBLead_Creation_DateCity_CodeCity_CategoryEmployer_CodeEmployer_Category1Employer_Category2Monthly_IncomeCustomer_Existing_Primary_Bank_CodePrimary_Bank_TypeContactedSourceSource_CategoryExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
strstrstrstrstrstrstrstri64f64strstrstrstrstrf64i64i64f64i64i64i64
"APPC90493171225""Female""23/07/79""15/07/16""C10001""A""COM0044082""A"42000.0"B001""P""N""S122""G"0.0nullnullnullnull00
"APPD40611263344""Male""07/12/86""04/07/16""C10003""A""COM0000002""C"13500.0"B002""P""Y""S122""G"0.020000213.25953100
"APPE70289249423""Male""10/12/82""19/07/16""C10125""C""COM0005267""C"42250.0"B003""G""Y""S143""B"0.0450004nullnull00
"APPF80273865537""Male""30/01/89""09/07/16""C10477""C""COM0004143""A"43500.0"B003""G""Y""S143""B"0.0920005nullnull70
"APPG60994436641""Male""19/04/85""20/07/16""C10002""A""COM0001781""A"410000.0"B001""P""Y""S134""B"2500.0500002nullnull100
" + ], + "text/plain": [ + "shape: (5, 22)\n", + "┌────────────────┬────────┬──────────┬────────────────┬───┬───────────────┬──────┬──────┬──────────┐\n", + "│ ID ┆ Gender ┆ DOB ┆ Lead_Creation_ ┆ … ┆ Interest_Rate ┆ EMI ┆ Var1 ┆ Approved │\n", + "│ --- ┆ --- ┆ --- ┆ Date ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ --- ┆ ┆ f64 ┆ i64 ┆ i64 ┆ i64 │\n", + "│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n", + "╞════════════════╪════════╪══════════╪════════════════╪═══╪═══════════════╪══════╪══════╪══════════╡\n", + "│ APPC9049317122 ┆ Female ┆ 23/07/79 ┆ 15/07/16 ┆ … ┆ null ┆ null ┆ 0 ┆ 0 │\n", + "│ 5 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPD4061126334 ┆ Male ┆ 07/12/86 ┆ 04/07/16 ┆ … ┆ 13.25 ┆ 953 ┆ 10 ┆ 0 │\n", + "│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPE7028924942 ┆ Male ┆ 10/12/82 ┆ 19/07/16 ┆ … ┆ null ┆ null ┆ 0 ┆ 0 │\n", + "│ 3 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPF8027386553 ┆ Male ┆ 30/01/89 ┆ 09/07/16 ┆ … ┆ null ┆ null ┆ 7 ┆ 0 │\n", + "│ 7 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPG6099443664 ┆ Male ┆ 19/04/85 ┆ 20/07/16 ┆ … ┆ null ┆ null ┆ 10 ┆ 0 │\n", + "│ 1 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└────────────────┴────────┴──────────┴────────────────┴───┴───────────────┴──────┴──────┴──────────┘" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pl.read_parquet(\"dependency.parquet\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(69713, 22)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "dia = DIA(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_15913/3037619369.py:1: UserWarning: The following columns are dropped because they cannot be used in dependency detection: ['Monthly_Income', 'Existing_EMI', 'Interest_Rate']\n", + " dia.infer_dependency()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (171, 3)
columnbycond_entropy
strstrf64
"Gender""ID"0.0
"Contacted""ID"0.0
"Approved""ID"0.0
"Primary_Bank_Type""Customer_Existing_Primary_Bank…0.0
"Primary_Bank_Type""ID"0.0
"Loan_Amount""City_Code"2.702889
"City_Code""EMI"3.147327
"Lead_Creation_Date""EMI"3.92818
"Lead_Creation_Date""City_Code"4.204907
"Lead_Creation_Date""Loan_Amount"4.336805
" + ], + "text/plain": [ + "shape: (171, 3)\n", + "┌────────────────────┬─────────────────────────────────┬──────────────┐\n", + "│ column ┆ by ┆ cond_entropy │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 │\n", + "╞════════════════════╪═════════════════════════════════╪══════════════╡\n", + "│ Gender ┆ ID ┆ 0.0 │\n", + "│ Contacted ┆ ID ┆ 0.0 │\n", + "│ Approved ┆ ID ┆ 0.0 │\n", + "│ Primary_Bank_Type ┆ Customer_Existing_Primary_Bank… ┆ 0.0 │\n", + "│ Primary_Bank_Type ┆ ID ┆ 0.0 │\n", + "│ … ┆ … ┆ … │\n", + "│ Loan_Amount ┆ City_Code ┆ 2.702889 │\n", + "│ City_Code ┆ EMI ┆ 3.147327 │\n", + "│ Lead_Creation_Date ┆ EMI ┆ 3.92818 │\n", + "│ Lead_Creation_Date ┆ City_Code ┆ 4.204907 │\n", + "│ Lead_Creation_Date ┆ Loan_Amount ┆ 4.336805 │\n", + "└────────────────────┴─────────────────────────────────┴──────────────┘" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dia.infer_dependency()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tq/Projects/polars_ds_extension/python/polars_ds/eda/diagnosis.py:689: UserWarning: The following columns are dropped because they cannot be used in dependency detection: ['Monthly_Income', 'Existing_EMI', 'Interest_Rate']\n", + " dep_frame = self.infer_dependency(subset=subset)\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Dependency Plot\n", + "\n", + "\n", + "\n", + "Gender\n", + "\n", + "Gender\n", + "\n", + "\n", + "\n", + "ID\n", + "\n", + "ID\n", + "\n", + "\n", + "\n", + "ID->Gender\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "DOB\n", + "\n", + "DOB\n", + "\n", + "\n", + "\n", + "ID->DOB\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "EMI\n", + "\n", + "EMI\n", + "\n", + "\n", + "\n", + "ID->EMI\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Employer_Code\n", + "\n", + "Employer_Code\n", + "\n", + "\n", + "\n", + "ID->Employer_Code\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Loan_Amount\n", + "\n", + "Loan_Amount\n", + "\n", + "\n", + "\n", + "ID->Loan_Amount\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Contacted\n", + "\n", + "Contacted\n", + "\n", + "\n", + "\n", + "ID->Contacted\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Var1\n", + "\n", + "Var1\n", + "\n", + "\n", + "\n", + "ID->Var1\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "City_Code\n", + "\n", + "City_Code\n", + "\n", + "\n", + "\n", + "ID->City_Code\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Source_Category\n", + "\n", + "Source_Category\n", + "\n", + "\n", + "\n", + "ID->Source_Category\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Source\n", + "\n", + "Source\n", + "\n", + "\n", + "\n", + "ID->Source\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Loan_Period\n", + "\n", + "Loan_Period\n", + "\n", + "\n", + "\n", + "ID->Loan_Period\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Lead_Creation_Date\n", + "\n", + "Lead_Creation_Date\n", + "\n", + "\n", + "\n", + "ID->Lead_Creation_Date\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Approved\n", + "\n", + "Approved\n", + "\n", + "\n", + "\n", + "ID->Approved\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Customer_Existing_Primary_Bank_Code\n", + "\n", + "Customer_Existing_Primary_Bank_Code\n", + "\n", + "\n", + "\n", + "ID->Customer_Existing_Primary_Bank_Code\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Employer_Category2\n", + "\n", + "Employer_Category2\n", + "\n", + "\n", + "\n", + "Employer_Code->Employer_Category2\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Employer_Category1\n", + "\n", + "Employer_Category1\n", + "\n", + "\n", + "\n", + "Employer_Code->Employer_Category1\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "City_Category\n", + "\n", + "City_Category\n", + "\n", + "\n", + "\n", + "City_Code->City_Category\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Primary_Bank_Type\n", + "\n", + "Primary_Bank_Type\n", + "\n", + "\n", + "\n", + "Customer_Existing_Primary_Bank_Code->Primary_Bank_Type\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dia.plot_dependency()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tq/Projects/polars_ds_extension/python/polars_ds/eda/diagnosis.py:689: UserWarning: The following columns are dropped because they cannot be used in dependency detection: ['Monthly_Income', 'Existing_EMI', 'Interest_Rate']\n", + " dep_frame = self.infer_dependency(subset=subset)\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Dependency Plot\n", + "\n", + "\n", + "\n", + "Primary_Bank_Type\n", + "\n", + "Primary_Bank_Type\n", + "\n", + "\n", + "\n", + "Customer_Existing_Primary_Bank_Code\n", + "\n", + "Customer_Existing_Primary_Bank_Code\n", + "\n", + "\n", + "\n", + "Customer_Existing_Primary_Bank_Code->Primary_Bank_Type\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "City_Category\n", + "\n", + "City_Category\n", + "\n", + "\n", + "\n", + "City_Code\n", + "\n", + "City_Code\n", + "\n", + "\n", + "\n", + "City_Code->City_Category\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Employer_Category2\n", + "\n", + "Employer_Category2\n", + "\n", + "\n", + "\n", + "Employer_Code\n", + "\n", + "Employer_Code\n", + "\n", + "\n", + "\n", + "Employer_Code->Employer_Category2\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Employer_Category1\n", + "\n", + "Employer_Category1\n", + "\n", + "\n", + "\n", + "Employer_Code->Employer_Category1\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ID implies everything, of course, because ID is unique.\n", + "# So let's not plot it\n", + "dia.plot_dependency(subset=pl.all().exclude(\"ID\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (13, 13)
columnnull_countn_uniquemost_freqmost_freq_cntmin_byte_lenmin_char_lenavg_byte_lenavg_char_lenmax_byte_lenmax_char_len5p_byte_len95p_byte_len
stru32u32stru32u32u32f64f64u32u32f64f64
"ID"069713"APPC90493171225"1151515.015.0151515.015.0
"Gender"02"Male"39949444.8539014.853901664.06.0
"DOB"1510760"11/01/82"253888.08.0888.08.0
"Lead_Creation_Date"092"02/09/16"1838888.08.0888.08.0
"City_Code"814679"C10001"10007666.06.0666.06.0
"Customer_Existing_Primary_Bank…939158"B001"14197444.04.0444.04.0
"Primary_Bank_Type"93913"P"39619111.01.0111.01.0
"Contacted"02"Y"45275111.01.0111.01.0
"Source"029"S122"30941444.04.0444.04.0
"Source_Category"07"B"29812111.01.0111.01.0
" + ], + "text/plain": [ + "shape: (13, 13)\n", + "┌───────────┬───────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", + "│ column ┆ null_coun ┆ n_unique ┆ most_freq ┆ … ┆ max_byte_ ┆ max_char_ ┆ 5p_byte_l ┆ 95p_byte_ │\n", + "│ --- ┆ t ┆ --- ┆ --- ┆ ┆ len ┆ len ┆ en ┆ len │\n", + "│ str ┆ --- ┆ u32 ┆ str ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ ┆ u32 ┆ ┆ ┆ ┆ u32 ┆ u32 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ ID ┆ 0 ┆ 69713 ┆ APPC90493 ┆ … ┆ 15 ┆ 15 ┆ 15.0 ┆ 15.0 │\n", + "│ ┆ ┆ ┆ 171225 ┆ ┆ ┆ ┆ ┆ │\n", + "│ Gender ┆ 0 ┆ 2 ┆ Male ┆ … ┆ 6 ┆ 6 ┆ 4.0 ┆ 6.0 │\n", + "│ DOB ┆ 15 ┆ 10760 ┆ 11/01/82 ┆ … ┆ 8 ┆ 8 ┆ 8.0 ┆ 8.0 │\n", + "│ Lead_Crea ┆ 0 ┆ 92 ┆ 02/09/16 ┆ … ┆ 8 ┆ 8 ┆ 8.0 ┆ 8.0 │\n", + "│ tion_Date ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ City_Code ┆ 814 ┆ 679 ┆ C10001 ┆ … ┆ 6 ┆ 6 ┆ 6.0 ┆ 6.0 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ Customer_ ┆ 9391 ┆ 58 ┆ B001 ┆ … ┆ 4 ┆ 4 ┆ 4.0 ┆ 4.0 │\n", + "│ Existing_ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ Primary_B ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ank… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ Primary_B ┆ 9391 ┆ 3 ┆ P ┆ … ┆ 1 ┆ 1 ┆ 1.0 ┆ 1.0 │\n", + "│ ank_Type ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ Contacted ┆ 0 ┆ 2 ┆ Y ┆ … ┆ 1 ┆ 1 ┆ 1.0 ┆ 1.0 │\n", + "│ Source ┆ 0 ┆ 29 ┆ S122 ┆ … ┆ 4 ┆ 4 ┆ 4.0 ┆ 4.0 │\n", + "│ Source_Ca ┆ 0 ┆ 7 ┆ B ┆ … ┆ 1 ┆ 1 ┆ 1.0 ┆ 1.0 │\n", + "│ tegory ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────────┴───────────┴──────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Basic stats about string columns\n", + "dia.str_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Correlation, Null Correlation, Feature Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
columnnon_null_cntnull%meanstdminq1medianq3maxIQRoutlier_cnthistogram
Employer_Category2654156.17%3.7200.8071.0004.0004.0004.0004.0000.0007833
57.6K04.26K1.96K1.62K57.6K
Monthly_Income697130.00%5,622.283174,767.0620.0001,650.0002,500.0004,000.00038,383,838.3002,350.0003920
69.7K069.7K53411
Existing_EMI696620.07%360.9292,288.5180.0000.0000.000350.000545,436.500350.0006549
69.7K069.7K511
Loan_Amount4200439.75%39,429.98330,727.5965,000.00020,000.00030,000.00050,000.000300,000.00030,000.0001547
15.0K08.65K15.0K6.42K5.98K1.61K1.03K1.72K5881556272659857294324
Loan_Period4200439.75%3.8911.1671.0003.0004.0005.0006.0002.0000
16.7K01.89K4.27K7.06K12.1K16.7K1
Interest_Rate2227668.05%19.2145.84711.99015.25018.00020.00037.0004.7502378
4.35K07582.75K4.35K2.44K1.44K2.44K3.97K15685865712184797321.43K290407240
EMI2227668.05%1,101.466752.661118.000649.000941.0001,295.00013,556.000646.0001081
9.89K08.18K9.89K2.41K9793702631064617121112
Var1697130.00%3.9483.8190.0000.0002.0007.00010.0007.0000
23.3K023.3K13.4K7.67K11.9K13.4K
Approved697130.00%0.0150.1200.0000.0000.0000.0001.0000.0001020
68.7K068.7K1.02K
\n", + "\n", + "
\n", + " " + ], + "text/plain": [ + "GT(_tbl_data=shape: (9, 13)\n", + "┌────────────┬────────────┬──────────┬────────────┬───┬──────────┬─────────┬───────────┬───────────┐\n", + "│ column ┆ non_null_c ┆ null% ┆ mean ┆ … ┆ max ┆ IQR ┆ outlier_c ┆ histogram │\n", + "│ --- ┆ nt ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ nt ┆ --- │\n", + "│ str ┆ --- ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ --- ┆ struct[1] │\n", + "│ ┆ u32 ┆ ┆ ┆ ┆ ┆ ┆ u32 ┆ │\n", + "╞════════════╪════════════╪══════════╪════════════╪═══╪══════════╪═════════╪═══════════╪═══════════╡\n", + "│ Employer_C ┆ 65415 ┆ 0.061653 ┆ 3.720187 ┆ … ┆ 4.0 ┆ 0.0 ┆ 7833 ┆ {[4258, │\n", + "│ ategory2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1955, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 57582]} │\n", + "│ Monthly_In ┆ 69713 ┆ 0.0 ┆ 5622.2832 ┆ … ┆ 3.8384e7 ┆ 2350.0 ┆ 3920 ┆ {[69699, │\n", + "│ come ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5, … 1]} │\n", + "│ Existing_E ┆ 69662 ┆ 0.000732 ┆ 360.928751 ┆ … ┆ 545436.5 ┆ 350.0 ┆ 6549 ┆ {[69655, │\n", + "│ MI ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5, … 1]} │\n", + "│ Loan_Amoun ┆ 42004 ┆ 0.397472 ┆ 39429.9828 ┆ … ┆ 300000.0 ┆ 30000.0 ┆ 1547 ┆ {[8646, │\n", + "│ t ┆ ┆ ┆ 59 ┆ ┆ ┆ ┆ ┆ 15019, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 4]} │\n", + "│ Loan_Perio ┆ 42004 ┆ 0.397472 ┆ 3.890629 ┆ … ┆ 6.0 ┆ 2.0 ┆ 0 ┆ {[1886, │\n", + "│ d ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 4266, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1]} │\n", + "│ Interest_R ┆ 22276 ┆ 0.680461 ┆ 19.21357 ┆ … ┆ 37.0 ┆ 4.75 ┆ 2378 ┆ {[758, │\n", + "│ ate ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2752, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 240]} │\n", + "│ EMI ┆ 22276 ┆ 0.680461 ┆ 1101.46624 ┆ … ┆ 13556.0 ┆ 646.0 ┆ 1081 ┆ {[8179, │\n", + "│ ┆ ┆ ┆ 2 ┆ ┆ ┆ ┆ ┆ 9889, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2]} │\n", + "│ Var1 ┆ 69713 ┆ 0.0 ┆ 3.948446 ┆ … ┆ 10.0 ┆ 7.0 ┆ 0 ┆ {[23308, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 13363, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 13420]} │\n", + "│ Approved ┆ 69713 ┆ 0.0 ┆ 0.014631 ┆ … ┆ 1.0 ┆ 0.0 ┆ 1020 ┆ {[68693, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1020]} │\n", + "└────────────┴────────────┴──────────┴────────────┴───┴──────────┴─────────┴───────────┴───────────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='non_null_cnt', type=, column_label='non_null_cnt', column_align='center', column_width=None), ColInfo(var='null%', type=, column_label='null%', column_align='right', column_width=None), ColInfo(var='mean', type=, column_label='mean', column_align='right', column_width=None), ColInfo(var='std', type=, column_label='std', column_align='right', column_width=None), ColInfo(var='min', type=, column_label='min', column_align='right', column_width=None), ColInfo(var='q1', type=, column_label='q1', column_align='right', column_width=None), ColInfo(var='median', type=, column_label='median', column_align='right', column_width=None), ColInfo(var='q3', type=, column_label='q3', column_align='right', column_width=None), ColInfo(var='max', type=, column_label='max', column_align='right', column_width=None), ColInfo(var='IQR', type=, column_label='IQR', column_align='right', column_width=None), ColInfo(var='outlier_cnt', type=, column_label='outlier_cnt', column_align='center', column_width=None), ColInfo(var='histogram', type=, column_label='histogram', column_align='center', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title=None, subtitle=None, preheader=None), _stubhead='column', _source_notes=[], _footnotes=[], _styles=[], _locale=, _formats=[, , ], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pl.read_parquet(\"dependency.parquet\")\n", + "df.head()\n", + "\n", + "dia = DIA(df)\n", + "dia.numeric_profile(iqr_multiplier=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 22)
IDGenderDOBLead_Creation_DateCity_CodeCity_CategoryEmployer_CodeEmployer_Category1Employer_Category2Monthly_IncomeCustomer_Existing_Primary_Bank_CodePrimary_Bank_TypeContactedSourceSource_CategoryExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
strstrstrstrstrstrstrstri64f64strstrstrstrstrf64i64i64f64i64i64i64
"APPC90493171225""Female""23/07/79""15/07/16""C10001""A""COM0044082""A"42000.0"B001""P""N""S122""G"0.0nullnullnullnull00
"APPD40611263344""Male""07/12/86""04/07/16""C10003""A""COM0000002""C"13500.0"B002""P""Y""S122""G"0.020000213.25953100
"APPE70289249423""Male""10/12/82""19/07/16""C10125""C""COM0005267""C"42250.0"B003""G""Y""S143""B"0.0450004nullnull00
"APPF80273865537""Male""30/01/89""09/07/16""C10477""C""COM0004143""A"43500.0"B003""G""Y""S143""B"0.0920005nullnull70
"APPG60994436641""Male""19/04/85""20/07/16""C10002""A""COM0001781""A"410000.0"B001""P""Y""S134""B"2500.0500002nullnull100
" + ], + "text/plain": [ + "shape: (5, 22)\n", + "┌────────────────┬────────┬──────────┬────────────────┬───┬───────────────┬──────┬──────┬──────────┐\n", + "│ ID ┆ Gender ┆ DOB ┆ Lead_Creation_ ┆ … ┆ Interest_Rate ┆ EMI ┆ Var1 ┆ Approved │\n", + "│ --- ┆ --- ┆ --- ┆ Date ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ --- ┆ ┆ f64 ┆ i64 ┆ i64 ┆ i64 │\n", + "│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n", + "╞════════════════╪════════╪══════════╪════════════════╪═══╪═══════════════╪══════╪══════╪══════════╡\n", + "│ APPC9049317122 ┆ Female ┆ 23/07/79 ┆ 15/07/16 ┆ … ┆ null ┆ null ┆ 0 ┆ 0 │\n", + "│ 5 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPD4061126334 ┆ Male ┆ 07/12/86 ┆ 04/07/16 ┆ … ┆ 13.25 ┆ 953 ┆ 10 ┆ 0 │\n", + "│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPE7028924942 ┆ Male ┆ 10/12/82 ┆ 19/07/16 ┆ … ┆ null ┆ null ┆ 0 ┆ 0 │\n", + "│ 3 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPF8027386553 ┆ Male ┆ 30/01/89 ┆ 09/07/16 ┆ … ┆ null ┆ null ┆ 7 ┆ 0 │\n", + "│ 7 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPG6099443664 ┆ Male ┆ 19/04/85 ┆ 20/07/16 ┆ … ┆ null ┆ null ┆ 10 ┆ 0 │\n", + "│ 1 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└────────────────┴────────┴──────────┴────────────────┴───┴───────────────┴──────┴──────┴──────────┘" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 10)
columnEmployer_Category2Monthly_IncomeExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
strf64f64f64f64f64f64f64f64f64
"Monthly_Income"0.0015461.00.2458260.039998-0.003671-0.0147890.0351630.0248540.000472
"Existing_EMI"-0.0170740.2458261.00.008653-0.004603-0.0230010.0008130.006620.027821
" + ], + "text/plain": [ + "shape: (2, 10)\n", + "┌────────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬──────────┬──────────┐\n", + "│ column ┆ Employer_C ┆ Monthly_I ┆ Existing_ ┆ … ┆ Interest_ ┆ EMI ┆ Var1 ┆ Approved │\n", + "│ --- ┆ ategory2 ┆ ncome ┆ EMI ┆ ┆ Rate ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ f64 ┆ f64 │\n", + "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ ┆ ┆ │\n", + "╞════════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪══════════╪══════════╡\n", + "│ Monthly_In ┆ 0.001546 ┆ 1.0 ┆ 0.245826 ┆ … ┆ -0.014789 ┆ 0.035163 ┆ 0.024854 ┆ 0.000472 │\n", + "│ come ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ Existing_E ┆ -0.017074 ┆ 0.245826 ┆ 1.0 ┆ … ┆ -0.023001 ┆ 0.000813 ┆ 0.00662 ┆ 0.027821 │\n", + "│ MI ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└────────────┴────────────┴───────────┴───────────┴───┴───────────┴──────────┴──────────┴──────────┘" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars.selectors as cs\n", + "\n", + "dia.corr(subset=[\"Monthly_Income\", \"Existing_EMI\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (22, 10)
columnEmployer_Category2Monthly_IncomeExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
strf64f64f64f64f64f64f64f64f64
"ID"-0.0013440.002872-0.0018460.002515-0.004610.0039370.0001440.001832-0.002095
"Gender"-0.0419010.198756-0.143366-0.0123850.04055-0.0108010.0104270.53310.045283
"DOB"-0.0054170.005234-0.0025140.0048040.0061140.0004920.005204-0.000032-0.002547
"Lead_Creation_Date"0.0028680.0066150.009870.001086-0.0793010.014667-0.0147680.039963-0.005199
"City_Code"0.065158-0.0920070.039832-0.0247160.060910.1445210.0597810.030293-0.028195
"Loan_Period"-0.0156740.020818-0.0741730.4916371.0-0.0595340.145961-0.009532-0.000028
"Interest_Rate"0.242253-0.662215-0.040708-0.3561-0.0595341.0-0.284007-0.620129-0.12408
"EMI"-0.0686920.480323-0.2961460.8896930.145961-0.2840071.00.29340.040533
"Var1"-0.1143330.67349-0.0154270.296295-0.009532-0.6201290.29341.00.103802
"Approved"-0.020020.1201550.0566690.042231-0.000028-0.124080.0405330.1038021.0
" + ], + "text/plain": [ + "shape: (22, 10)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ column ┆ Employer_ ┆ Monthly_I ┆ Existing_ ┆ … ┆ Interest_ ┆ EMI ┆ Var1 ┆ Approved │\n", + "│ --- ┆ Category2 ┆ ncome ┆ EMI ┆ ┆ Rate ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ f64 ┆ f64 │\n", + "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ ┆ ┆ │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ ID ┆ -0.001344 ┆ 0.002872 ┆ -0.001846 ┆ … ┆ 0.003937 ┆ 0.000144 ┆ 0.001832 ┆ -0.00209 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5 │\n", + "│ Gender ┆ -0.041901 ┆ 0.198756 ┆ -0.143366 ┆ … ┆ -0.010801 ┆ 0.010427 ┆ 0.5331 ┆ 0.045283 │\n", + "│ DOB ┆ -0.005417 ┆ 0.005234 ┆ -0.002514 ┆ … ┆ 0.000492 ┆ 0.005204 ┆ -0.000032 ┆ -0.00254 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 7 │\n", + "│ Lead_Crea ┆ 0.002868 ┆ 0.006615 ┆ 0.00987 ┆ … ┆ 0.014667 ┆ -0.014768 ┆ 0.039963 ┆ -0.00519 │\n", + "│ tion_Date ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 9 │\n", + "│ City_Code ┆ 0.065158 ┆ -0.092007 ┆ 0.039832 ┆ … ┆ 0.144521 ┆ 0.059781 ┆ 0.030293 ┆ -0.02819 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ Loan_Peri ┆ -0.015674 ┆ 0.020818 ┆ -0.074173 ┆ … ┆ -0.059534 ┆ 0.145961 ┆ -0.009532 ┆ -0.00002 │\n", + "│ od ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 8 │\n", + "│ Interest_ ┆ 0.242253 ┆ -0.662215 ┆ -0.040708 ┆ … ┆ 1.0 ┆ -0.284007 ┆ -0.620129 ┆ -0.12408 │\n", + "│ Rate ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ EMI ┆ -0.068692 ┆ 0.480323 ┆ -0.296146 ┆ … ┆ -0.284007 ┆ 1.0 ┆ 0.2934 ┆ 0.040533 │\n", + "│ Var1 ┆ -0.114333 ┆ 0.67349 ┆ -0.015427 ┆ … ┆ -0.620129 ┆ 0.2934 ┆ 1.0 ┆ 0.103802 │\n", + "│ Approved ┆ -0.02002 ┆ 0.120155 ┆ 0.056669 ┆ … ┆ -0.12408 ┆ 0.040533 ┆ 0.103802 ┆ 1.0 │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dia.corr(subset=cs.all(), method=\"spearman\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
columnEmployer_Category2Monthly_IncomeExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
Monthly_Income−0.1101.0000.1710.4810.021−0.6620.4800.6730.120
Existing_EMI0.0350.1711.000−0.225−0.074−0.041−0.296−0.0150.057
\n", + "\n", + "
\n", + " " + ], + "text/plain": [ + "GT(_tbl_data=shape: (2, 10)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ column ┆ Employer_ ┆ Monthly_I ┆ Existing_ ┆ … ┆ Interest_ ┆ EMI ┆ Var1 ┆ Approved │\n", + "│ --- ┆ Category2 ┆ ncome ┆ EMI ┆ ┆ Rate ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ f64 ┆ f64 │\n", + "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ ┆ ┆ │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ Monthly_I ┆ -0.109954 ┆ 1.0 ┆ 0.170825 ┆ … ┆ -0.662215 ┆ 0.480323 ┆ 0.67349 ┆ 0.120155 │\n", + "│ ncome ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ Existing_ ┆ 0.034867 ┆ 0.170825 ┆ 1.0 ┆ … ┆ -0.040708 ┆ -0.296146 ┆ -0.015427 ┆ 0.056669 │\n", + "│ EMI ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='Employer_Category2', type=, column_label='Employer_Category2', column_align='right', column_width=None), ColInfo(var='Monthly_Income', type=, column_label='Monthly_Income', column_align='right', column_width=None), ColInfo(var='Existing_EMI', type=, column_label='Existing_EMI', column_align='right', column_width=None), ColInfo(var='Loan_Amount', type=, column_label='Loan_Amount', column_align='right', column_width=None), ColInfo(var='Loan_Period', type=, column_label='Loan_Period', column_align='right', column_width=None), ColInfo(var='Interest_Rate', type=, column_label='Interest_Rate', column_align='right', column_width=None), ColInfo(var='EMI', type=, column_label='EMI', column_align='right', column_width=None), ColInfo(var='Var1', type=, column_label='Var1', column_align='right', column_width=None), ColInfo(var='Approved', type=, column_label='Approved', column_align='right', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title=None, subtitle=None, preheader=None), _stubhead=None, _source_notes=[], _footnotes=[], _styles=[StyleInfo(locname=LocBody(columns='Employer_Category2', rows=[0]), grpname=None, colname='Employer_Category2', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#550281')]), StyleInfo(locname=LocBody(columns='Employer_Category2', rows=[1]), grpname=None, colname='Employer_Category2', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#630278')]), StyleInfo(locname=LocBody(columns='Monthly_Income', rows=[0]), grpname=None, colname='Monthly_Income', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#bd0237')]), StyleInfo(locname=LocBody(columns='Monthly_Income', rows=[1]), grpname=None, colname='Monthly_Income', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6f026f')]), StyleInfo(locname=LocBody(columns='Existing_EMI', rows=[0]), grpname=None, colname='Existing_EMI', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6f026f')]), StyleInfo(locname=LocBody(columns='Existing_EMI', rows=[1]), grpname=None, colname='Existing_EMI', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#bd0237')]), StyleInfo(locname=LocBody(columns='Loan_Amount', rows=[0]), grpname=None, colname='Loan_Amount', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#8c025a')]), StyleInfo(locname=LocBody(columns='Loan_Amount', rows=[1]), grpname=None, colname='Loan_Amount', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#4a0289')]), StyleInfo(locname=LocBody(columns='Loan_Period', rows=[0]), grpname=None, colname='Loan_Period', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#610279')]), StyleInfo(locname=LocBody(columns='Loan_Period', rows=[1]), grpname=None, colname='Loan_Period', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#59027f')]), StyleInfo(locname=LocBody(columns='Interest_Rate', rows=[0]), grpname=None, colname='Interest_Rate', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#2202a6')]), StyleInfo(locname=LocBody(columns='Interest_Rate', rows=[1]), grpname=None, colname='Interest_Rate', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#5c027d')]), StyleInfo(locname=LocBody(columns='EMI', rows=[0]), grpname=None, colname='EMI', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#8c025a')]), StyleInfo(locname=LocBody(columns='EMI', rows=[1]), grpname=None, colname='EMI', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#44028e')]), StyleInfo(locname=LocBody(columns='Var1', rows=[0]), grpname=None, colname='Var1', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#9e024d')]), StyleInfo(locname=LocBody(columns='Var1', rows=[1]), grpname=None, colname='Var1', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#5e027b')]), StyleInfo(locname=LocBody(columns='Approved', rows=[0]), grpname=None, colname='Approved', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6b0272')]), StyleInfo(locname=LocBody(columns='Approved', rows=[1]), grpname=None, colname='Approved', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#650276')])], _locale=, _formats=[], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dia.plot_corr(subset=[\"Monthly_Income\", \"Existing_EMI\"], method=\"spearman\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
columnEmployer_Category2Monthly_IncomeExisting_EMILoan_AmountLoan_PeriodInterest_RateEMIVar1Approved
Monthly_Income0.3791.0000.1120.7080.6550.8950.8700.4850.017
Existing_EMI0.3930.1181.0000.7070.6990.8680.8750.0430.016
\n", + "\n", + "
\n", + " " + ], + "text/plain": [ + "GT(_tbl_data=shape: (2, 10)\n", + "┌────────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬──────────┬──────────┐\n", + "│ column ┆ Employer_C ┆ Monthly_I ┆ Existing_ ┆ … ┆ Interest_ ┆ EMI ┆ Var1 ┆ Approved │\n", + "│ --- ┆ ategory2 ┆ ncome ┆ EMI ┆ ┆ Rate ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ f64 ┆ f64 ┆ f64 │\n", + "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ ┆ ┆ │\n", + "╞════════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪══════════╪══════════╡\n", + "│ Monthly_In ┆ 0.379364 ┆ 0.999957 ┆ 0.112024 ┆ … ┆ 0.894811 ┆ 0.869738 ┆ 0.485249 ┆ 0.01699 │\n", + "│ come ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ Existing_E ┆ 0.392603 ┆ 0.117722 ┆ 0.999946 ┆ … ┆ 0.868281 ┆ 0.875256 ┆ 0.043346 ┆ 0.015995 │\n", + "│ MI ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└────────────┴────────────┴───────────┴───────────┴───┴───────────┴──────────┴──────────┴──────────┘, _body=, _boxhead=Boxhead([ColInfo(var='column', type=, column_label='column', column_align='left', column_width=None), ColInfo(var='Employer_Category2', type=, column_label='Employer_Category2', column_align='right', column_width=None), ColInfo(var='Monthly_Income', type=, column_label='Monthly_Income', column_align='right', column_width=None), ColInfo(var='Existing_EMI', type=, column_label='Existing_EMI', column_align='right', column_width=None), ColInfo(var='Loan_Amount', type=, column_label='Loan_Amount', column_align='right', column_width=None), ColInfo(var='Loan_Period', type=, column_label='Loan_Period', column_align='right', column_width=None), ColInfo(var='Interest_Rate', type=, column_label='Interest_Rate', column_align='right', column_width=None), ColInfo(var='EMI', type=, column_label='EMI', column_align='right', column_width=None), ColInfo(var='Var1', type=, column_label='Var1', column_align='right', column_width=None), ColInfo(var='Approved', type=, column_label='Approved', column_align='right', column_width=None)]), _stub=, _spanners=Spanners([]), _heading=Heading(title=None, subtitle=None, preheader=None), _stubhead=None, _source_notes=[], _footnotes=[], _styles=[StyleInfo(locname=LocBody(columns='Employer_Category2', rows=[0]), grpname=None, colname='Employer_Category2', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#830261')]), StyleInfo(locname=LocBody(columns='Employer_Category2', rows=[1]), grpname=None, colname='Employer_Category2', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#840260')]), StyleInfo(locname=LocBody(columns='Monthly_Income', rows=[0]), grpname=None, colname='Monthly_Income', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#bd0237')]), StyleInfo(locname=LocBody(columns='Monthly_Income', rows=[1]), grpname=None, colname='Monthly_Income', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6b0272')]), StyleInfo(locname=LocBody(columns='Existing_EMI', rows=[0]), grpname=None, colname='Existing_EMI', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#6a0272')]), StyleInfo(locname=LocBody(columns='Existing_EMI', rows=[1]), grpname=None, colname='Existing_EMI', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#bd0237')]), StyleInfo(locname=LocBody(columns='Loan_Amount', rows=[0]), grpname=None, colname='Loan_Amount', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#a2024b')]), StyleInfo(locname=LocBody(columns='Loan_Amount', rows=[1]), grpname=None, colname='Loan_Amount', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#a2024b')]), StyleInfo(locname=LocBody(columns='Loan_Period', rows=[0]), grpname=None, colname='Loan_Period', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#9d024e')]), StyleInfo(locname=LocBody(columns='Loan_Period', rows=[1]), grpname=None, colname='Loan_Period', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#a1024b')]), StyleInfo(locname=LocBody(columns='Interest_Rate', rows=[0]), grpname=None, colname='Interest_Rate', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#b3023e')]), StyleInfo(locname=LocBody(columns='Interest_Rate', rows=[1]), grpname=None, colname='Interest_Rate', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#b10240')]), StyleInfo(locname=LocBody(columns='EMI', rows=[0]), grpname=None, colname='EMI', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#b10240')]), StyleInfo(locname=LocBody(columns='EMI', rows=[1]), grpname=None, colname='EMI', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#b1023f')]), StyleInfo(locname=LocBody(columns='Var1', rows=[0]), grpname=None, colname='Var1', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#8d0259')]), StyleInfo(locname=LocBody(columns='Var1', rows=[1]), grpname=None, colname='Var1', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#640277')]), StyleInfo(locname=LocBody(columns='Approved', rows=[0]), grpname=None, colname='Approved', rownum=0, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#610279')]), StyleInfo(locname=LocBody(columns='Approved', rows=[1]), grpname=None, colname='Approved', rownum=1, colnum=None, styles=[CellStyleText(color='#FFFFFF', font=None, size=None, align=None, v_align=None, style=None, weight=None, stretch=None, decorate=None, transform=None, whitespace=None), CellStyleFill(color='#610279')])], _locale=, _formats=[], _substitutions=[], _options=Options(table_id=OptionsInfo(scss=False, category='table', type='value', value=None), table_caption=OptionsInfo(scss=False, category='table', type='value', value=None), table_width=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_layout=OptionsInfo(scss=True, category='table', type='value', value='fixed'), table_margin_left=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_margin_right=OptionsInfo(scss=True, category='table', type='px', value='auto'), table_background_color=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_additional_css=OptionsInfo(scss=False, category='table', type='values', value=[]), table_font_names=OptionsInfo(scss=False, category='table', type='values', value=['-apple-system', 'BlinkMacSystemFont', 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Helvetica Neue', 'Fira Sans', 'Droid Sans', 'Arial', 'sans-serif']), table_font_size=OptionsInfo(scss=True, category='table', type='px', value='16px'), table_font_weight=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_style=OptionsInfo(scss=True, category='table', type='value', value='normal'), table_font_color=OptionsInfo(scss=True, category='table', type='value', value='#333333'), table_font_color_light=OptionsInfo(scss=True, category='table', type='value', value='#FFFFFF'), table_border_top_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_top_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_top_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_top_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_right_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_right_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_right_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), table_border_bottom_include=OptionsInfo(scss=False, category='table', type='boolean', value=True), table_border_bottom_style=OptionsInfo(scss=True, category='table', type='value', value='solid'), table_border_bottom_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_bottom_color=OptionsInfo(scss=True, category='table', type='value', value='#A8A8A8'), table_border_left_style=OptionsInfo(scss=True, category='table', type='value', value='none'), table_border_left_width=OptionsInfo(scss=True, category='table', type='px', value='2px'), table_border_left_color=OptionsInfo(scss=True, category='table', type='value', value='#D3D3D3'), heading_background_color=OptionsInfo(scss=True, category='heading', type='value', value=None), heading_align=OptionsInfo(scss=True, category='heading', type='value', value='center'), heading_title_font_size=OptionsInfo(scss=True, category='heading', type='px', value='125%'), heading_title_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_subtitle_font_size=OptionsInfo(scss=True, category='heading', type='px', value='85%'), heading_subtitle_font_weight=OptionsInfo(scss=True, category='heading', type='value', value='initial'), heading_padding=OptionsInfo(scss=True, category='heading', type='px', value='4px'), heading_padding_horizontal=OptionsInfo(scss=True, category='heading', type='px', value='5px'), heading_border_bottom_style=OptionsInfo(scss=True, category='heading', type='value', value='solid'), heading_border_bottom_width=OptionsInfo(scss=True, category='heading', type='px', value='2px'), heading_border_bottom_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), heading_border_lr_style=OptionsInfo(scss=True, category='heading', type='value', value='none'), heading_border_lr_width=OptionsInfo(scss=True, category='heading', type='px', value='1px'), heading_border_lr_color=OptionsInfo(scss=True, category='heading', type='value', value='#D3D3D3'), column_labels_background_color=OptionsInfo(scss=True, category='column_labels', type='value', value=None), column_labels_font_size=OptionsInfo(scss=True, category='column_labels', type='px', value='100%'), column_labels_font_weight=OptionsInfo(scss=True, category='column_labels', type='value', value='normal'), column_labels_text_transform=OptionsInfo(scss=True, category='column_labels', type='value', value='inherit'), column_labels_padding=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_padding_horizontal=OptionsInfo(scss=True, category='column_labels', type='px', value='5px'), column_labels_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), column_labels_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), column_labels_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), column_labels_border_top_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_top_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_top_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_bottom_style=OptionsInfo(scss=True, category='column_labels', type='value', value='solid'), column_labels_border_bottom_width=OptionsInfo(scss=True, category='column_labels', type='px', value='2px'), column_labels_border_bottom_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_border_lr_style=OptionsInfo(scss=True, category='column_labels', type='value', value='none'), column_labels_border_lr_width=OptionsInfo(scss=True, category='column_labels', type='px', value='1px'), column_labels_border_lr_color=OptionsInfo(scss=True, category='column_labels', type='value', value='#D3D3D3'), column_labels_hidden=OptionsInfo(scss=False, category='column_labels', type='boolean', value=False), row_group_background_color=OptionsInfo(scss=True, category='row_group', type='value', value=None), row_group_font_size=OptionsInfo(scss=True, category='row_group', type='px', value='100%'), row_group_font_weight=OptionsInfo(scss=True, category='row_group', type='value', value='initial'), row_group_text_transform=OptionsInfo(scss=True, category='row_group', type='value', value='inherit'), row_group_padding=OptionsInfo(scss=True, category='row_group', type='px', value='8px'), row_group_padding_horizontal=OptionsInfo(scss=True, category='row_group', type='px', value='5px'), row_group_border_top_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_top_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_top_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_right_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_right_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_right_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_bottom_style=OptionsInfo(scss=True, category='row_group', type='value', value='solid'), row_group_border_bottom_width=OptionsInfo(scss=True, category='row_group', type='px', value='2px'), row_group_border_bottom_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_border_left_style=OptionsInfo(scss=True, category='row_group', type='value', value='none'), row_group_border_left_width=OptionsInfo(scss=True, category='row_group', type='px', value='1px'), row_group_border_left_color=OptionsInfo(scss=True, category='row_group', type='value', value='#D3D3D3'), row_group_as_column=OptionsInfo(scss=False, category='row_group', type='boolean', value=False), table_body_hlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_hlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_hlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_vlines_style=OptionsInfo(scss=True, category='table_body', type='value', value='none'), table_body_vlines_width=OptionsInfo(scss=True, category='table_body', type='px', value='1px'), table_body_vlines_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_top_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_top_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_top_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), table_body_border_bottom_style=OptionsInfo(scss=True, category='table_body', type='value', value='solid'), table_body_border_bottom_width=OptionsInfo(scss=True, category='table_body', type='px', value='2px'), table_body_border_bottom_color=OptionsInfo(scss=True, category='table_body', type='value', value='#D3D3D3'), data_row_padding=OptionsInfo(scss=True, category='data_row', type='px', value='8px'), data_row_padding_horizontal=OptionsInfo(scss=True, category='data_row', type='px', value='5px'), stub_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), stub_row_group_background_color=OptionsInfo(scss=True, category='stub', type='value', value=None), stub_row_group_font_size=OptionsInfo(scss=True, category='stub', type='px', value='100%'), stub_row_group_font_weight=OptionsInfo(scss=True, category='stub', type='value', value='initial'), stub_row_group_text_transform=OptionsInfo(scss=True, category='stub', type='value', value='inherit'), stub_row_group_border_style=OptionsInfo(scss=True, category='stub', type='value', value='solid'), stub_row_group_border_width=OptionsInfo(scss=True, category='stub', type='px', value='2px'), stub_row_group_border_color=OptionsInfo(scss=True, category='stub', type='value', value='#D3D3D3'), source_notes_padding=OptionsInfo(scss=True, category='source_notes', type='px', value='4px'), source_notes_padding_horizontal=OptionsInfo(scss=True, category='source_notes', type='px', value='5px'), source_notes_background_color=OptionsInfo(scss=True, category='source_notes', type='value', value=None), source_notes_font_size=OptionsInfo(scss=True, category='source_notes', type='px', value='90%'), source_notes_border_bottom_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_bottom_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_bottom_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_border_lr_style=OptionsInfo(scss=True, category='source_notes', type='value', value='none'), source_notes_border_lr_width=OptionsInfo(scss=True, category='source_notes', type='px', value='2px'), source_notes_border_lr_color=OptionsInfo(scss=True, category='source_notes', type='value', value='#D3D3D3'), source_notes_multiline=OptionsInfo(scss=False, category='source_notes', type='boolean', value=True), source_notes_sep=OptionsInfo(scss=False, category='source_notes', type='value', value=' '), row_striping_background_color=OptionsInfo(scss=True, category='row', type='value', value='rgba(128,128,128,0.05)'), row_striping_include_stub=OptionsInfo(scss=False, category='row', type='boolean', value=False), row_striping_include_table_body=OptionsInfo(scss=False, category='row', type='boolean', value=False), container_width=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_height=OptionsInfo(scss=False, category='container', type='px', value='auto'), container_padding_x=OptionsInfo(scss=False, category='container', type='px', value='0px'), container_padding_y=OptionsInfo(scss=False, category='container', type='px', value='10px'), container_overflow_x=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), container_overflow_y=OptionsInfo(scss=False, category='container', type='overflow', value='auto'), quarto_disable_processing=OptionsInfo(scss=False, category='quarto', type='logical', value=False), quarto_use_bootstrap=OptionsInfo(scss=False, category='quarto', type='logical', value=False)), _has_built=False)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dia.plot_corr(subset=[\"Monthly_Income\", \"Existing_EMI\"], method=\"xi\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (7, 3)
column_1column_2null_corr
strstrf64
"City_Code""City_Category"1.0
"Employer_Code""Employer_Category1"1.0
"Customer_Existing_Primary_Bank…"Primary_Bank_Type"1.0
"Loan_Amount""Loan_Period"1.0
"Interest_Rate""EMI"1.0
"Employer_Code""Employer_Category2"0.964816
"Employer_Category1""Employer_Category2"0.964816
" + ], + "text/plain": [ + "shape: (7, 3)\n", + "┌─────────────────────────────────┬────────────────────┬───────────┐\n", + "│ column_1 ┆ column_2 ┆ null_corr │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 │\n", + "╞═════════════════════════════════╪════════════════════╪═══════════╡\n", + "│ City_Code ┆ City_Category ┆ 1.0 │\n", + "│ Employer_Code ┆ Employer_Category1 ┆ 1.0 │\n", + "│ Customer_Existing_Primary_Bank… ┆ Primary_Bank_Type ┆ 1.0 │\n", + "│ Loan_Amount ┆ Loan_Period ┆ 1.0 │\n", + "│ Interest_Rate ┆ EMI ┆ 1.0 │\n", + "│ Employer_Code ┆ Employer_Category2 ┆ 0.964816 │\n", + "│ Employer_Category1 ┆ Employer_Category2 ┆ 0.964816 │\n", + "└─────────────────────────────────┴────────────────────┴───────────┘" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Correlation between A is null and B is null for A, B combinations in the subset\n", + "dia.null_corr(subset = pl.all()).filter(\n", + " pl.col(\"null_corr\").abs() > 0.7\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "02004006008001,0001,2001,4001,6001,8002,0002,2002,4002,6002,8003,0003,2003,4003,6003,8004,0004,2004,4004,6004,8005,0005,2005,4005,6005,8006,0006,2006,4006,6006,8007,0007,2007,4007,6007,8008,0008,2008,4008,6008,8009,0009,2009,4009,6009,80010,00010,20010,40010,60010,80011,00011,20011,40011,60011,80012,00012,20012,40012,60012,80013,00013,20013,40013,60013,80014,00005001,0001,5002,0002,500counts0.00.10.20.30.40.50.60.70.80.91.0(Null/NaN/Inf)%" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Plot a single feature's distribution, together with useful stats\n", + "df_bins, plot = dia.plot_feature(\n", + " \"EMI\", \n", + " n_bins=100, \n", + " density=False, \n", + ")\n", + "plot" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "01002003004005006007008009001,0001,1001,2001,3001,4001,5001,6001,7001,8001,9002,0002,1002,2002,3002,4002,5002,6002,7002,8002,9003,0003,1003,2003,3003,4003,5003,6003,7003,8003,9004,0004,1004,2004,3004,4004,50001002003004005006007008009001,000counts0.00.10.20.30.40.50.60.70.80.91.0(Null/NaN/Inf)%" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can provide a filter expression, which will be applied upfront\n", + "# You can see how because of this filter expression, null and extreme values are removed\n", + "df_bins, plot = dia.plot_feature(\n", + " \"EMI\", \n", + " n_bins=100, \n", + " density=False, \n", + " filter_by = pl.col(\"EMI\").is_between(pl.col(\"EMI\").quantile(0.01), pl.col(\"EMI\").quantile(0.99)),\n", + ")\n", + "plot" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "−2,00002,0004,0006,0008,00010,00012,00014,000EMI0.00000.00010.00020.00030.00040.00050.00060.00070.00080.0009density0.00.10.20.30.40.50.60.70.80.91.0(Null/NaN/Inf)%GPPrimary_Bank_TypeGPPrimary_Bank_TypeDistribution of EMI over Primary Bank Type" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dia.plot_feature_over(\n", + " \"EMI\", \n", + " segment = \"Primary_Bank_Type\",\n", + " # (pl.col(\"Loan_Amount\") > 10_000).alias(\"high_loan_amount\"), # The segment we want to use\n", + " n_bins=100, \n", + " density=True, \n", + " include_null_segment = False\n", + ").properties(\n", + " title = alt.Title(\"Distribution of EMI over Primary Bank Type\")\n", + ").interactive()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ML + ML Metrics Plots \n", + "\n", + "Example: Classic Iris Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 5)
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)species
f64f64f64f64str
5.13.51.40.2"setosa"
4.93.01.40.2"setosa"
4.73.21.30.2"setosa"
4.63.11.50.2"setosa"
5.03.61.40.2"setosa"
" + ], + "text/plain": [ + "shape: (5, 5)\n", + "┌───────────────────┬──────────────────┬───────────────────┬──────────────────┬─────────┐\n", + "│ sepal length (cm) ┆ sepal width (cm) ┆ petal length (cm) ┆ petal width (cm) ┆ species │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │\n", + "╞═══════════════════╪══════════════════╪═══════════════════╪══════════════════╪═════════╡\n", + "│ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", + "│ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", + "│ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa │\n", + "│ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa │\n", + "│ 5.0 ┆ 3.6 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", + "└───────────────────┴──────────────────┴───────────────────┴──────────────────┴─────────┘" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "import polars_ds as pds\n", + "import altair as alt\n", + "# Only used to get dataset.\n", + "from sklearn import datasets\n", + "\n", + "dataset = datasets.load_iris()\n", + "df = pl.from_numpy(dataset.data, schema = dataset.feature_names).with_columns(\n", + " pl.Series(values=dataset.target).alias(\"species\")\n", + ").with_columns(\n", + " pl.when(pl.col(\"species\") == 0).then(pl.lit('setosa'))\n", + " .when(pl.col(\"species\") == 1).then(pl.lit('versicolor'))\n", + " .when(pl.col(\"species\") == 2).then(pl.lit('virginica')).alias(\"species\")\n", + ")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "import polars_ds.eda.plots as pp" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "−4−3−2−101234pc1−1.0−0.50.00.51.0pc2setosaversicolorvirginicaspecies2 Principal Components Plot" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pp.plot_pca(\n", + " df,\n", + " [\"sepal length (cm)\", \"sepal width (cm)\", \"petal length (cm)\", \"petal width (cm)\"], \n", + " by = \"species\",\n", + " dim = 2\n", + ").properties(\n", + " title='2 Principal Components Plot'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "4.04.55.05.56.06.57.07.58.0sepal length (cm)y = 1.8584 * x - 7.1014, r2 = 0.7600012345678petal length (cm), y_predLinear Regression Chart" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Just for fun, let's see if there is any linear relationship between sepal length and petal length\n", + "plot = pp.plot_lin_reg(\n", + " df,\n", + " x = \"sepal length (cm)\", \n", + " target = \"petal length (cm)\", \n", + " add_bias=True,\n", + " show_lin_reg_eq=True # Prints out the linear regression result at the end\n", + ").properties(\n", + " title = \"Linear Regression Chart\"\n", + ")\n", + "plot.interactive()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "df = pl.DataFrame({\n", + " \"actual\": [0, 0, 1, 1, 1, 0],\n", + " \"pred\": [0.1, 0.4, 0.35, 0.8, 0.6, 0.23],\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "0.00.10.20.30.40.50.60.70.80.91.0False Positive Rate0.00.10.20.30.40.50.60.70.80.91.0True Positive RateMy Model (AUC = 0.8889)" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pp.plot_roc_auc(\n", + " df = df, \n", + " actual = \"actual\",\n", + " pred = \"pred\",\n", + " estimator_name = \"My Model\"\n", + ") # .interactive()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plots For Each Partition/Group/Segment/Category\n", + "\n", + "The PartitionHelper class can be used. Using the class is not necessary, but it simplifies the code and tries to catch common errors when it comes to partitions, and is recommended for beginners. \n", + "\n", + "If you are familiar with polars's partition function, you don't have to use this. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linear Regression Plot on Species" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 5)
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)species
f64f64f64f64str
5.13.51.40.2"setosa"
4.93.01.40.2"setosa"
4.73.21.30.2"setosa"
4.63.11.50.2"setosa"
5.03.61.40.2"setosa"
" + ], + "text/plain": [ + "shape: (5, 5)\n", + "┌───────────────────┬──────────────────┬───────────────────┬──────────────────┬─────────┐\n", + "│ sepal length (cm) ┆ sepal width (cm) ┆ petal length (cm) ┆ petal width (cm) ┆ species │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │\n", + "╞═══════════════════╪══════════════════╪═══════════════════╪══════════════════╪═════════╡\n", + "│ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", + "│ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", + "│ 4.7 ┆ 3.2 ┆ 1.3 ┆ 0.2 ┆ setosa │\n", + "│ 4.6 ┆ 3.1 ┆ 1.5 ┆ 0.2 ┆ setosa │\n", + "│ 5.0 ┆ 3.6 ┆ 1.4 ┆ 0.2 ┆ setosa │\n", + "└───────────────────┴──────────────────┴───────────────────┴──────────────────┴─────────┘" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "import polars_ds as pds\n", + "import altair as alt\n", + "# Only used to get dataset.\n", + "from sklearn import datasets\n", + "\n", + "dataset = datasets.load_iris()\n", + "df = pl.from_numpy(dataset.data, schema = dataset.feature_names).with_columns(\n", + " pl.Series(values=dataset.target).alias(\"species\")\n", + ").with_columns(\n", + " pl.when(pl.col(\"species\") == 0).then(pl.lit('setosa'))\n", + " .when(pl.col(\"species\") == 1).then(pl.lit('versicolor'))\n", + " .when(pl.col(\"species\") == 2).then(pl.lit('virginica')).alias(\"species\")\n", + ")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'setosa': alt.LayerChart(...), 'versicolor': alt.LayerChart(...), 'virginica': alt.LayerChart(...)}\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "4.24.44.64.85.05.25.45.65.8sepal length (cm)y = 0.1316 * x + 0.8031, r2 = 0.07140.00.20.40.60.81.01.21.41.61.82.0petal length (cm), y_predLinear Regression Chart on Species = setosa4.85.05.25.45.65.86.06.26.46.66.87.0sepal length (cm)y = 0.6865 * x + 0.1851, r2 = 0.56860.00.51.01.52.02.53.03.54.04.55.05.5petal length (cm), y_predLinear Regression Chart on Species = versicolor4.55.05.56.06.57.07.58.0sepal length (cm)y = 0.7501 * x + 0.6105, r2 = 0.746901234567petal length (cm), y_predLinear Regression Chart on Species = virginica" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from polars_ds.partition import PartitionHelper\n", + "\n", + "by = \"species\"\n", + "parts = PartitionHelper(df, by)\n", + "plots = parts.apply(\n", + " lambda p_name, sub_df: pp.plot_lin_reg(\n", + " sub_df,\n", + " x = \"sepal length (cm)\", \n", + " target = \"petal length (cm)\", \n", + " add_bias = True,\n", + " weights = None,\n", + " max_points = 2000,\n", + " ).properties(\n", + " title = f\"Linear Regression Chart on Species = {p_name}\"\n", + " )\n", + ")\n", + "\n", + "print(plots)\n", + "alt.vconcat(*plots.values()).configure(autosize=\"pad\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['setosa', 'versicolor', 'virginica']" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parts.names()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 5)
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)species
f64f64f64f64str
6.33.36.02.5"virginica"
5.82.75.11.9"virginica"
7.13.05.92.1"virginica"
6.32.95.61.8"virginica"
6.53.05.82.2"virginica"
" + ], + "text/plain": [ + "shape: (5, 5)\n", + "┌───────────────────┬──────────────────┬───────────────────┬──────────────────┬───────────┐\n", + "│ sepal length (cm) ┆ sepal width (cm) ┆ petal length (cm) ┆ petal width (cm) ┆ species │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │\n", + "╞═══════════════════╪══════════════════╪═══════════════════╪══════════════════╪═══════════╡\n", + "│ 6.3 ┆ 3.3 ┆ 6.0 ┆ 2.5 ┆ virginica │\n", + "│ 5.8 ┆ 2.7 ┆ 5.1 ┆ 1.9 ┆ virginica │\n", + "│ 7.1 ┆ 3.0 ┆ 5.9 ┆ 2.1 ┆ virginica │\n", + "│ 6.3 ┆ 2.9 ┆ 5.6 ┆ 1.8 ┆ virginica │\n", + "│ 6.5 ┆ 3.0 ┆ 5.8 ┆ 2.2 ┆ virginica │\n", + "└───────────────────┴──────────────────┴───────────────────┴──────────────────┴───────────┘" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parts.get(\"virginica\").head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ROC AUC Plot on Categories" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "df = pl.DataFrame({\n", + " \"actual\": [0, 0, 1, 1, 1, 0],\n", + " \"pred\": [0.1, 0.4, 0.35, 0.8, 0.6, 0.23],\n", + " \"category\": [\"a\", \"a\", \"a\", \"b\", \"b\", \"b\"]\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "0.00.10.20.30.40.50.60.70.80.91.0False Positive Rate0.00.10.20.30.40.50.60.70.80.91.0True Positive RateMy Model (AUC = 0.5)ROC AUC on a0.00.10.20.30.40.50.60.70.80.91.0False Positive Rate0.00.10.20.30.40.50.60.70.80.91.0True Positive RateMy Model (AUC = 1.0)ROC AUC on b" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from polars_ds.partition import PartitionHelper\n", + "\n", + "plots = PartitionHelper(df, by = \"category\").apply(\n", + " lambda p_name, sub_df: pp.plot_roc_auc(\n", + " df = sub_df, \n", + " actual = \"actual\",\n", + " pred = \"pred\",\n", + " estimator_name = \"My Model\"\n", + " ).properties(\n", + " title = f\"ROC AUC on {p_name}\"\n", + " )\n", + ")\n", + "\n", + "alt.vconcat(*plots.values()).configure(autosize=\"pad\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/pipeline.ipynb b/examples/pipeline.ipynb index 86796d34..326e3a42 100644 --- a/examples/pipeline.ipynb +++ b/examples/pipeline.ipynb @@ -8,7 +8,7 @@ "source": [ "import polars as pl\n", "import polars.selectors as cs\n", - "from polars_ds.pipeline import Pipeline, Blueprint" + "from polars_ds.modeling import Pipeline, Blueprint" ] }, { @@ -95,8 +95,8 @@ "output_type": "stream", "text": [ "Blueprint name: example\n", - "Blueprint current steps: 15\n", - "Features Expected: ['ID', 'Gender', 'DOB', 'Lead_Creation_Date', 'City_Code', 'City_Category', 'Employer_Code', 'Employer_Category1', 'Employer_Category2', 'Monthly_Income', 'Customer_Existing_Primary_Bank_Code', 'Primary_Bank_Type', 'Contacted', 'Source', 'Source_Category', 'Existing_EMI', 'Loan_Amount', 'Loan_Period', 'Interest_Rate', 'EMI', 'Var1', 'Approved']\n", + "Column names: Lowercase all incoming columns.Blueprint current steps: 11\n", + "Features Expected: ['id', 'gender', 'dob', 'lead_creation_date', 'city_code', 'city_category', 'employer_code', 'employer_category1', 'employer_category2', 'monthly_income', 'customer_existing_primary_bank_code', 'primary_bank_type', 'contacted', 'source', 'source_category', 'existing_emi', 'loan_amount', 'loan_period', 'interest_rate', 'emi', 'var1', 'approved']\n", "\n" ] } @@ -109,15 +109,13 @@ "# and target will be auto-filled if the transformation requires a target field and when no target field is explicitly given.\n", "\n", "bp = (\n", - " Blueprint(df, name = \"example\", target = \"approved\") # You can optionally put target of the ML model here\n", + " Blueprint(df, name = \"example\", target = \"approved\", lowercase=True) # You can optionally put target of the ML model here\n", " # Select only the columns we need\n", - " .lowercase() # lowercase all columns\n", " .sql_transform(sql) # Run a SQL transform on the df\n", " # Say you want to remove a population for your data pipeline.\n", " .filter( \n", " \"city_category is not null\" # or equivalently, you can do: pl.col(\"city_category\").is_not_null()\n", " )\n", - " .select(cs.numeric() | cs.by_name([\"gender\", \"employer_category1\", \"city_category\", \"test_col\"]))\n", " # explicitly put target, since this is not the target for prediction. \n", " # Use a linear regression with x1 = var1, x2=existing_emi to predict missing values in loan_period\n", " .linear_impute(features = [\"var1\", \"existing_emi\"], target = \"loan_period\") \n", @@ -139,7 +137,6 @@ " .one_hot_encode(\"gender\", drop_first=True)\n", " .woe_encode(\"city_category\") # No need to specify target because we initialized bp with a target\n", " .target_encode(\"employer_category1\", min_samples_leaf = 20, smoothing = 10.0) # same as above\n", - " .shrink_dtype(force_f32 = True) # shrink dtype to smallest possible. Force floats to be f32\n", ")\n", "\n", "print(bp)" @@ -156,30 +153,6 @@ "Naive Query Steps: \n", "\n", "Step 1:\n", - "col(\"ID\").alias(\"id\"),\n", - "col(\"Gender\").alias(\"gender\"),\n", - "col(\"DOB\").alias(\"dob\"),\n", - "col(\"Lead_Creation_Date\").alias(\"lead_creation_date\"),\n", - "col(\"City_Code\").alias(\"city_code\"),\n", - "col(\"City_Category\").alias(\"city_category\"),\n", - "col(\"Employer_Code\").alias(\"employer_code\"),\n", - "col(\"Employer_Category1\").alias(\"employer_category1\"),\n", - "col(\"Employer_Category2\").alias(\"employer_category2\"),\n", - "col(\"Monthly_Income\").alias(\"monthly_income\"),\n", - "col(\"Customer_Existing_Primary_Bank_Code\").alias(\"customer_existing_primary_bank_code\"),\n", - "col(\"Primary_Bank_Type\").alias(\"primary_bank_type\"),\n", - "col(\"Contacted\").alias(\"contacted\"),\n", - "col(\"Source\").alias(\"source\"),\n", - "col(\"Source_Category\").alias(\"source_category\"),\n", - "col(\"Existing_EMI\").alias(\"existing_emi\"),\n", - "col(\"Loan_Amount\").alias(\"loan_amount\"),\n", - "col(\"Loan_Period\").alias(\"loan_period\"),\n", - "col(\"Interest_Rate\").alias(\"interest_rate\"),\n", - "col(\"EMI\").alias(\"emi\"),\n", - "col(\"Var1\").alias(\"var1\"),\n", - "col(\"Approved\").alias(\"approved\")\n", - "\n", - "Step 2:\n", "Run SQL: \n", "select\n", "*\n", @@ -189,26 +162,23 @@ "\n", "\n", "\n", - "Step 3:\n", + "Step 2:\n", "col(\"city_category\").is_not_null()\n", "\n", - "Step 4:\n", - "selector\n", - "\n", - "Step 5:\n", + "Step 3:\n", ".when(col(\"loan_period\").is_null()).then([(col(\"var1\")) * (dyn float: 0.50981)].sum_horizontal([[(col(\"existing_emi\")) * (dyn float: -0.000008)]])).otherwise(col(\"loan_period\").strict_cast(Float64)).alias(\"loan_period\")\n", "\n", - "Step 6:\n", + "Step 4:\n", "col(\"existing_emi\").fill_null([dyn float: 0.0])\n", "\n", - "Step 7:\n", + "Step 5:\n", "col(\"existing_emi\").log1p().alias(\"existing_emi_log1p\"),\n", "col(\"loan_amount\").log1p().alias(\"loan_amount_log1p\"),\n", "col(\"loan_amount\").clip([dyn int: 0, dyn int: 1000]).alias(\"loan_amount_log1p_clipped\"),\n", "col(\"loan_amount\").sqrt().alias(\"loan_amount_sqrt\"),\n", "col(\"loan_amount\").shift([dyn int: -1]).alias(\"loan_amount_lag_1\")\n", "\n", - "Step 8:\n", + "Step 6:\n", "[([(col(\"employer_category2\")) - (dyn float: 3.67927)]) / (dyn float: 0.862459)],\n", "[([(col(\"monthly_income\")) - (dyn float: 7463.79731)]) / (dyn float: 225051.544361)],\n", "[([(col(\"existing_emi\")) - (dyn float: 265.690248)]) / (dyn float: 2757.609254)],\n", @@ -221,26 +191,20 @@ "[([(col(\"loan_amount_sqrt\")) - (dyn float: 186.421961)]) / (dyn float: 68.387392)],\n", "[([(col(\"loan_amount_lag_1\")) - (dyn float: 39430.445444)]) / (dyn float: 30727.449733)]\n", "\n", - "Step 9:\n", + "Step 7:\n", "col(\"employer_category1\").is_null().strict_cast(UInt8).alias(\"employer_category1_is_missing\")\n", "\n", - "Step 10:\n", + "Step 8:\n", "[(col(\"gender\")) ==v (String(Male))].strict_cast(UInt8).alias(\"gender_Male\")\n", "\n", - "Step 11:\n", + "Step 9:\n", "*.exclude([Name(\"gender\")])\n", "\n", - "Step 12:\n", + "Step 10:\n", "col(\"city_category\").replace_strict([Series[value], Series[woe], null])\n", "\n", - "Step 13:\n", - "col(\"employer_category1\").replace_strict([Series[value], Series[to], null])\n", - "\n", - "Step 14:\n", - "dtype_columns([Int16, UInt32, UInt8, Int32, UInt64, Int8, UInt16, Int64]).shrink_dtype()\n", - "\n", - "Step 15:\n", - "dtype_columns([Float64, Float32]).strict_cast(Float32)\n" + "Step 11:\n", + "col(\"employer_category1\").replace_strict([Series[value], Series[to], null])\n" ] }, "execution_count": 5, @@ -270,24 +234,29 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 19)
city_categoryemployer_category1employer_category2monthly_incomeexisting_emiloan_amountloan_periodinterest_rateemivar1approvedtest_colexisting_emi_log1ploan_amount_log1ploan_amount_log1p_clippedloan_amount_sqrtloan_amount_lag_1employer_category1_is_missinggender_Male
f32f32f32f32f32f32f32f32f32i8i8strf32f32f32f32f32u8u8
0.0809590.026603-3.106548-0.017613-0.096348-0.632338-1.619415-1.019936-0.197259100"TEST"0.0-0.586105NaN-0.6580250.18125701
-0.4795530.0266030.371879-0.023167-0.0963480.1812730.093681nullnull00"TEST"0.00.537137NaN0.3759481.71083401
-0.4795530.0147370.371879-0.017613-0.0963481.7108610.950229nullnull70"TEST"0.01.527696NaN1.7092780.34397801
0.0809590.0147370.3718790.0112690.8102340.343995-1.619415nullnull100"TEST"7.8244460.683076NaN0.5437382.94751301
-0.0464750.0243360.3718790.000161-0.0963482.9475490.950229-0.7462922.631433100"TEST"0.02.00661NaN2.546276-0.30690601
" + "shape: (5, 29)
iddoblead_creation_datecity_codecity_categoryemployer_codeemployer_category1employer_category2monthly_incomecustomer_existing_primary_bank_codeprimary_bank_typecontactedsourcesource_categoryexisting_emiloan_amountloan_periodinterest_rateemivar1approvedtest_colexisting_emi_log1ploan_amount_log1ploan_amount_log1p_clippedloan_amount_sqrtloan_amount_lag_1employer_category1_is_missinggender_Male
strstrstrstrf64strf64f64f64strstrstrstrstrf64f64f64f64f64i64i64strf64f64f64f64f64u8u8
"APPD40611263344""07/12/86""04/07/16""C10003"0.080959"COM0000002"0.026603-3.106548-0.017613"B002""P""Y""S122""G"-0.096348-0.632338-1.619415-1.019936-0.197259100"TEST"0.0-0.586105NaN-0.6580250.18125701
"APPE70289249423""10/12/82""19/07/16""C10125"-0.479553"COM0005267"0.0266030.371879-0.023167"B003""G""Y""S143""B"-0.0963480.1812730.093681nullnull00"TEST"0.00.537137NaN0.3759481.71083401
"APPF80273865537""30/01/89""09/07/16""C10477"-0.479553"COM0004143"0.0147370.371879-0.017613"B003""G""Y""S143""B"-0.0963481.7108610.950229nullnull70"TEST"0.01.527696NaN1.7092780.34397801
"APPG60994436641""19/04/85""20/07/16""C10002"0.080959"COM0001781"0.0147370.3718790.011269"B001""P""Y""S134""B"0.8102340.343995-1.619415nullnull100"TEST"7.8244460.683076NaN0.5437382.94751301
"APPK80327232033""28/03/73""02/07/16""C10022"-0.046475"COM0030526"0.0243360.3718790.000161"B003""G""Y""S122""C"-0.0963482.9475490.950229-0.7462912.631433100"TEST"0.02.00661NaN2.546276-0.30690601
" ], "text/plain": [ - "shape: (5, 19)\n", - "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", - "│ city_cate ┆ employer_ ┆ employer_ ┆ monthly_i ┆ … ┆ loan_amou ┆ loan_amou ┆ employer_ ┆ gender_M │\n", - "│ gory ┆ category1 ┆ category2 ┆ ncome ┆ ┆ nt_sqrt ┆ nt_lag_1 ┆ category1 ┆ ale │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ _is_missi ┆ --- │\n", - "│ f32 ┆ f32 ┆ f32 ┆ f32 ┆ ┆ f32 ┆ f32 ┆ ng ┆ u8 │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u8 ┆ │\n", - "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "│ 0.080959 ┆ 0.026603 ┆ -3.106548 ┆ -0.017613 ┆ … ┆ -0.658025 ┆ 0.181257 ┆ 0 ┆ 1 │\n", - "│ -0.479553 ┆ 0.026603 ┆ 0.371879 ┆ -0.023167 ┆ … ┆ 0.375948 ┆ 1.710834 ┆ 0 ┆ 1 │\n", - "│ -0.479553 ┆ 0.014737 ┆ 0.371879 ┆ -0.017613 ┆ … ┆ 1.709278 ┆ 0.343978 ┆ 0 ┆ 1 │\n", - "│ 0.080959 ┆ 0.014737 ┆ 0.371879 ┆ 0.011269 ┆ … ┆ 0.543738 ┆ 2.947513 ┆ 0 ┆ 1 │\n", - "│ -0.046475 ┆ 0.024336 ┆ 0.371879 ┆ 0.000161 ┆ … ┆ 2.546276 ┆ -0.306906 ┆ 0 ┆ 1 │\n", - "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + "shape: (5, 29)\n", + "┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", + "│ id ┆ dob ┆ lead_crea ┆ city_code ┆ … ┆ loan_amou ┆ loan_amou ┆ employer_ ┆ gender_Ma │\n", + "│ --- ┆ --- ┆ tion_date ┆ --- ┆ ┆ nt_sqrt ┆ nt_lag_1 ┆ category1 ┆ le │\n", + "│ str ┆ str ┆ --- ┆ str ┆ ┆ --- ┆ --- ┆ _is_missi ┆ --- │\n", + "│ ┆ ┆ str ┆ ┆ ┆ f64 ┆ f64 ┆ ng ┆ u8 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u8 ┆ │\n", + "╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ APPD40611 ┆ 07/12/86 ┆ 04/07/16 ┆ C10003 ┆ … ┆ -0.658025 ┆ 0.181257 ┆ 0 ┆ 1 │\n", + "│ 263344 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPE70289 ┆ 10/12/82 ┆ 19/07/16 ┆ C10125 ┆ … ┆ 0.375948 ┆ 1.710834 ┆ 0 ┆ 1 │\n", + "│ 249423 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPF80273 ┆ 30/01/89 ┆ 09/07/16 ┆ C10477 ┆ … ┆ 1.709278 ┆ 0.343978 ┆ 0 ┆ 1 │\n", + "│ 865537 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPG60994 ┆ 19/04/85 ┆ 20/07/16 ┆ C10002 ┆ … ┆ 0.543738 ┆ 2.947513 ┆ 0 ┆ 1 │\n", + "│ 436641 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPK80327 ┆ 28/03/73 ┆ 02/07/16 ┆ C10022 ┆ … ┆ 2.546276 ┆ -0.306906 ┆ 0 ┆ 1 │\n", + "│ 232033 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────────┴──────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 6, @@ -304,13 +273,6 @@ "df_transformed.head()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 7, @@ -326,19 +288,18 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (0, 19)
city_categoryemployer_category1employer_category2monthly_incomeexisting_emiloan_amountloan_periodinterest_rateemivar1approvedtest_colexisting_emi_log1ploan_amount_log1ploan_amount_log1p_clippedloan_amount_sqrtloan_amount_lag_1employer_category1_is_missinggender_Male
f32f32f32f32f32f32f32f32f32i8i8strf32f32f32f32f32u8u8
" + "shape: (0, 29)
iddoblead_creation_datecity_codecity_categoryemployer_codeemployer_category1employer_category2monthly_incomecustomer_existing_primary_bank_codeprimary_bank_typecontactedsourcesource_categoryexisting_emiloan_amountloan_periodinterest_rateemivar1approvedtest_colexisting_emi_log1ploan_amount_log1ploan_amount_log1p_clippedloan_amount_sqrtloan_amount_lag_1employer_category1_is_missinggender_Male
strstrstrstrf64strf64f64f64strstrstrstrstrf64f64f64f64f64i64i64strf64f64f64f64f64u8u8
" ], "text/plain": [ - "shape: (0, 19)\n", - "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", - "│ city_cate ┆ employer_ ┆ employer_ ┆ monthly_i ┆ … ┆ loan_amou ┆ loan_amou ┆ employer_ ┆ gender_M │\n", - "│ gory ┆ category1 ┆ category2 ┆ ncome ┆ ┆ nt_sqrt ┆ nt_lag_1 ┆ category1 ┆ ale │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ _is_missi ┆ --- │\n", - "│ f32 ┆ f32 ┆ f32 ┆ f32 ┆ ┆ f32 ┆ f32 ┆ ng ┆ u8 │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u8 ┆ │\n", - "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + "shape: (0, 29)\n", + "┌─────┬─────┬──────────────┬───────────┬───┬─────────────┬─────────────┬─────────────┬─────────────┐\n", + "│ id ┆ dob ┆ lead_creatio ┆ city_code ┆ … ┆ loan_amount ┆ loan_amount ┆ employer_ca ┆ gender_Male │\n", + "│ --- ┆ --- ┆ n_date ┆ --- ┆ ┆ _sqrt ┆ _lag_1 ┆ tegory1_is_ ┆ --- │\n", + "│ str ┆ str ┆ --- ┆ str ┆ ┆ --- ┆ --- ┆ missing ┆ u8 │\n", + "│ ┆ ┆ str ┆ ┆ ┆ f64 ┆ f64 ┆ --- ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u8 ┆ │\n", + "╞═════╪═════╪══════════════╪═══════════╪═══╪═════════════╪═════════════╪═════════════╪═════════════╡\n", + "└─────┴─────┴──────────────┴───────────┴───┴─────────────┴─────────────┴─────────────┴─────────────┘" ] }, "execution_count": 7, @@ -368,19 +329,18 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (0, 19)
city_categoryemployer_category1employer_category2monthly_incomeexisting_emiloan_amountloan_periodinterest_rateemivar1approvedtest_colexisting_emi_log1ploan_amount_log1ploan_amount_log1p_clippedloan_amount_sqrtloan_amount_lag_1employer_category1_is_missinggender_Male
f32f32f32f32f32f32f32f32f32i8i8strf32f32f32f32f32u8u8
" + "shape: (0, 29)
iddoblead_creation_datecity_codecity_categoryemployer_codeemployer_category1employer_category2monthly_incomecustomer_existing_primary_bank_codeprimary_bank_typecontactedsourcesource_categoryexisting_emiloan_amountloan_periodinterest_rateemivar1approvedtest_colexisting_emi_log1ploan_amount_log1ploan_amount_log1p_clippedloan_amount_sqrtloan_amount_lag_1employer_category1_is_missinggender_Male
strstrstrstrf64strf64f64f64strstrstrstrstrf64f64f64f64f64i64i64strf64f64f64f64f64u8u8
" ], "text/plain": [ - "shape: (0, 19)\n", - "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", - "│ city_cate ┆ employer_ ┆ employer_ ┆ monthly_i ┆ … ┆ loan_amou ┆ loan_amou ┆ employer_ ┆ gender_M │\n", - "│ gory ┆ category1 ┆ category2 ┆ ncome ┆ ┆ nt_sqrt ┆ nt_lag_1 ┆ category1 ┆ ale │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ _is_missi ┆ --- │\n", - "│ f32 ┆ f32 ┆ f32 ┆ f32 ┆ ┆ f32 ┆ f32 ┆ ng ┆ u8 │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u8 ┆ │\n", - "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + "shape: (0, 29)\n", + "┌─────┬─────┬──────────────┬───────────┬───┬─────────────┬─────────────┬─────────────┬─────────────┐\n", + "│ id ┆ dob ┆ lead_creatio ┆ city_code ┆ … ┆ loan_amount ┆ loan_amount ┆ employer_ca ┆ gender_Male │\n", + "│ --- ┆ --- ┆ n_date ┆ --- ┆ ┆ _sqrt ┆ _lag_1 ┆ tegory1_is_ ┆ --- │\n", + "│ str ┆ str ┆ --- ┆ str ┆ ┆ --- ┆ --- ┆ missing ┆ u8 │\n", + "│ ┆ ┆ str ┆ ┆ ┆ f64 ┆ f64 ┆ --- ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u8 ┆ │\n", + "╞═════╪═════╪══════════════╪═══════════╪═══╪═════════════╪═════════════╪═════════════╪═════════════╡\n", + "└─────┴─────┴──────────────┴───────────┴───┴─────────────┴─────────────┴─────────────┴─────────────┘" ] }, "execution_count": 8, @@ -427,30 +387,6 @@ "Naive Query Steps: \n", "\n", "Step 1:\n", - "col(\"ID\").alias(\"id\"),\n", - "col(\"Gender\").alias(\"gender\"),\n", - "col(\"DOB\").alias(\"dob\"),\n", - "col(\"Lead_Creation_Date\").alias(\"lead_creation_date\"),\n", - "col(\"City_Code\").alias(\"city_code\"),\n", - "col(\"City_Category\").alias(\"city_category\"),\n", - "col(\"Employer_Code\").alias(\"employer_code\"),\n", - "col(\"Employer_Category1\").alias(\"employer_category1\"),\n", - "col(\"Employer_Category2\").alias(\"employer_category2\"),\n", - "col(\"Monthly_Income\").alias(\"monthly_income\"),\n", - "col(\"Customer_Existing_Primary_Bank_Code\").alias(\"customer_existing_primary_bank_code\"),\n", - "col(\"Primary_Bank_Type\").alias(\"primary_bank_type\"),\n", - "col(\"Contacted\").alias(\"contacted\"),\n", - "col(\"Source\").alias(\"source\"),\n", - "col(\"Source_Category\").alias(\"source_category\"),\n", - "col(\"Existing_EMI\").alias(\"existing_emi\"),\n", - "col(\"Loan_Amount\").alias(\"loan_amount\"),\n", - "col(\"Loan_Period\").alias(\"loan_period\"),\n", - "col(\"Interest_Rate\").alias(\"interest_rate\"),\n", - "col(\"EMI\").alias(\"emi\"),\n", - "col(\"Var1\").alias(\"var1\"),\n", - "col(\"Approved\").alias(\"approved\")\n", - "\n", - "Step 2:\n", "Run SQL: \n", "select\n", "*\n", @@ -460,26 +396,23 @@ "\n", "\n", "\n", - "Step 3:\n", + "Step 2:\n", "col(\"city_category\").is_not_null()\n", "\n", - "Step 4:\n", - "selector\n", - "\n", - "Step 5:\n", + "Step 3:\n", ".when(col(\"loan_period\").is_null()).then([(col(\"var1\")) * (dyn float: 0.50981)].sum_horizontal([[(col(\"existing_emi\")) * (dyn float: -0.000008)]])).otherwise(col(\"loan_period\").strict_cast(Float64)).alias(\"loan_period\")\n", "\n", - "Step 6:\n", + "Step 4:\n", "col(\"existing_emi\").fill_null([dyn float: 0.0])\n", "\n", - "Step 7:\n", + "Step 5:\n", "col(\"existing_emi\").log1p().alias(\"existing_emi_log1p\"),\n", "col(\"loan_amount\").log1p().alias(\"loan_amount_log1p\"),\n", "col(\"loan_amount\").clip([dyn int: 0, dyn int: 1000]).alias(\"loan_amount_log1p_clipped\"),\n", "col(\"loan_amount\").sqrt().alias(\"loan_amount_sqrt\"),\n", "col(\"loan_amount\").shift([dyn int: -1]).alias(\"loan_amount_lag_1\")\n", "\n", - "Step 8:\n", + "Step 6:\n", "[([(col(\"employer_category2\")) - (dyn float: 3.67927)]) / (dyn float: 0.862459)],\n", "[([(col(\"monthly_income\")) - (dyn float: 7463.79731)]) / (dyn float: 225051.544361)],\n", "[([(col(\"existing_emi\")) - (dyn float: 265.690248)]) / (dyn float: 2757.609254)],\n", @@ -492,26 +425,20 @@ "[([(col(\"loan_amount_sqrt\")) - (dyn float: 186.421961)]) / (dyn float: 68.387392)],\n", "[([(col(\"loan_amount_lag_1\")) - (dyn float: 39430.445444)]) / (dyn float: 30727.449733)]\n", "\n", - "Step 9:\n", + "Step 7:\n", "col(\"employer_category1\").is_null().strict_cast(UInt8).alias(\"employer_category1_is_missing\")\n", "\n", - "Step 10:\n", + "Step 8:\n", "[(col(\"gender\")) ==v (String(Male))].strict_cast(UInt8).alias(\"gender_Male\")\n", "\n", - "Step 11:\n", + "Step 9:\n", "*.exclude([Name(\"gender\")])\n", "\n", - "Step 12:\n", + "Step 10:\n", "col(\"city_category\").replace_strict([Series[value], Series[woe], null])\n", "\n", - "Step 13:\n", - "col(\"employer_category1\").replace_strict([Series[value], Series[to], null])\n", - "\n", - "Step 14:\n", - "dtype_columns([Int16, UInt32, UInt8, Int32, UInt64, Int8, UInt16, Int64]).shrink_dtype()\n", - "\n", - "Step 15:\n", - "dtype_columns([Float64, Float32]).strict_cast(Float32)\n" + "Step 11:\n", + "col(\"employer_category1\").replace_strict([Series[value], Series[to], null])\n" ] }, "execution_count": 10, @@ -541,30 +468,40 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (42_004, 19)
city_categoryemployer_category1employer_category2monthly_incomeexisting_emiloan_amountloan_periodinterest_rateemivar1approvedtest_colexisting_emi_log1ploan_amount_log1ploan_amount_log1p_clippedloan_amount_sqrtloan_amount_lag_1employer_category1_is_missinggender_Male
f32f32f32f32f32f32f32f32f32i8i8strf32f32f32f32f32u8u8
0.0809590.026603-3.106548-0.017613-0.096348-0.632338-1.619415-1.019936-0.197259100"TEST"0.0-0.586105NaN-0.6580250.18125701
-0.4795530.0266030.371879-0.023167-0.0963480.1812730.093681nullnull00"TEST"0.00.537137NaN0.3759481.71083401
-0.4795530.0147370.371879-0.017613-0.0963481.7108610.950229nullnull70"TEST"0.01.527696NaN1.7092780.34397801
0.0809590.0147370.3718790.0112690.8102340.343995-1.619415nullnull100"TEST"7.8244460.683076NaN0.5437382.94751301
-0.0464750.0243360.3718790.000161-0.0963482.9475490.950229-0.7462922.631433100"TEST"0.02.00661NaN2.546276-0.30690601
0.0809590.024336-1.947072-0.012725-0.096348-0.306893-0.762867-1.062693-0.120198100"TEST"0.0-0.024488NaN-0.193265-0.30690601
-0.4795530.014737-3.106548-0.022501-0.096348-0.306893-0.762867nullnull20"TEST"0.0-0.024488NaN-0.193265-0.50217101
-0.0464750.0266030.371879-0.026055-0.096348-0.502160.0936812.785431-0.21054620"TEST"0.0-0.333569NaN-0.460651.32030300
0.0809590.026603-0.7875970.0107940.3990091.3203280.950229nullnull100"TEST"7.2203741.334103NaN1.409920.96231701
0.0809590.014737-3.106548-0.014369-0.0963480.9623390.093681-0.8933751.041041100"TEST"0.01.129211NaN1.115067null01
" + "shape: (42_004, 29)
iddoblead_creation_datecity_codecity_categoryemployer_codeemployer_category1employer_category2monthly_incomecustomer_existing_primary_bank_codeprimary_bank_typecontactedsourcesource_categoryexisting_emiloan_amountloan_periodinterest_rateemivar1approvedtest_colexisting_emi_log1ploan_amount_log1ploan_amount_log1p_clippedloan_amount_sqrtloan_amount_lag_1employer_category1_is_missinggender_Male
strstrstrstrf64strf64f64f64strstrstrstrstrf64f64f64f64f64i64i64strf64f64f64f64f64u8u8
"APPD40611263344""07/12/86""04/07/16""C10003"0.080959"COM0000002"0.026603-3.106548-0.017613"B002""P""Y""S122""G"-0.096348-0.632338-1.619415-1.019936-0.197259100"TEST"0.0-0.586105NaN-0.6580250.18125701
"APPE70289249423""10/12/82""19/07/16""C10125"-0.479553"COM0005267"0.0266030.371879-0.023167"B003""G""Y""S143""B"-0.0963480.1812730.093681nullnull00"TEST"0.00.537137NaN0.3759481.71083401
"APPF80273865537""30/01/89""09/07/16""C10477"-0.479553"COM0004143"0.0147370.371879-0.017613"B003""G""Y""S143""B"-0.0963481.7108610.950229nullnull70"TEST"0.01.527696NaN1.7092780.34397801
"APPG60994436641""19/04/85""20/07/16""C10002"0.080959"COM0001781"0.0147370.3718790.011269"B001""P""Y""S134""B"0.8102340.343995-1.619415nullnull100"TEST"7.8244460.683076NaN0.5437382.94751301
"APPK80327232033""28/03/73""02/07/16""C10022"-0.046475"COM0030526"0.0243360.3718790.000161"B003""G""Y""S122""C"-0.0963482.9475490.950229-0.7462912.631433100"TEST"0.02.00661NaN2.546276-0.30690601
"APPS20215136404""04/03/86""30/09/16""C10002"0.080959"COM0000003"0.024336-1.947072-0.012725"B001""P""Y""S122""G"-0.096348-0.306893-0.762867-1.062693-0.120198100"TEST"0.0-0.024488NaN-0.193265-0.30690601
"APPT50870248519""03/03/91""30/09/16""C10041"-0.479553"COM0000009"0.014737-3.106548-0.022501"B003""G""Y""S122""G"-0.096348-0.306893-0.762867nullnull20"TEST"0.0-0.024488NaN-0.193265-0.50217101
"APPW50697209842""01/02/92""30/09/16""C10022"-0.046475"COM0013284"0.0266030.371879-0.026055"B030""P""Y""S122""G"-0.096348-0.502160.0936812.785431-0.21054620"TEST"0.0-0.333569NaN-0.460651.32030300
"APPY50870035036""27/06/78""30/09/16""C10002"0.080959"COM0000098"0.026603-0.7875970.010794"B002""P""Y""S122""G"0.3990091.3203280.950229nullnull100"TEST"7.2203741.334103NaN1.409920.96231701
"APPZ60733046119""31/12/89""30/09/16""C10003"0.080959"COM0000056"0.014737-3.106548-0.014369nullnull"Y""S122""G"-0.0963480.9623390.093681-0.8933751.041041100"TEST"0.01.129211NaN1.115067null01
" ], "text/plain": [ - "shape: (42_004, 19)\n", - "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", - "│ city_cate ┆ employer_ ┆ employer_ ┆ monthly_i ┆ … ┆ loan_amou ┆ loan_amou ┆ employer_ ┆ gender_M │\n", - "│ gory ┆ category1 ┆ category2 ┆ ncome ┆ ┆ nt_sqrt ┆ nt_lag_1 ┆ category1 ┆ ale │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ _is_missi ┆ --- │\n", - "│ f32 ┆ f32 ┆ f32 ┆ f32 ┆ ┆ f32 ┆ f32 ┆ ng ┆ u8 │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u8 ┆ │\n", - "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "│ 0.080959 ┆ 0.026603 ┆ -3.106548 ┆ -0.017613 ┆ … ┆ -0.658025 ┆ 0.181257 ┆ 0 ┆ 1 │\n", - "│ -0.479553 ┆ 0.026603 ┆ 0.371879 ┆ -0.023167 ┆ … ┆ 0.375948 ┆ 1.710834 ┆ 0 ┆ 1 │\n", - "│ -0.479553 ┆ 0.014737 ┆ 0.371879 ┆ -0.017613 ┆ … ┆ 1.709278 ┆ 0.343978 ┆ 0 ┆ 1 │\n", - "│ 0.080959 ┆ 0.014737 ┆ 0.371879 ┆ 0.011269 ┆ … ┆ 0.543738 ┆ 2.947513 ┆ 0 ┆ 1 │\n", - "│ -0.046475 ┆ 0.024336 ┆ 0.371879 ┆ 0.000161 ┆ … ┆ 2.546276 ┆ -0.306906 ┆ 0 ┆ 1 │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 0.080959 ┆ 0.024336 ┆ -1.947072 ┆ -0.012725 ┆ … ┆ -0.193265 ┆ -0.306906 ┆ 0 ┆ 1 │\n", - "│ -0.479553 ┆ 0.014737 ┆ -3.106548 ┆ -0.022501 ┆ … ┆ -0.193265 ┆ -0.502171 ┆ 0 ┆ 1 │\n", - "│ -0.046475 ┆ 0.026603 ┆ 0.371879 ┆ -0.026055 ┆ … ┆ -0.46065 ┆ 1.320303 ┆ 0 ┆ 0 │\n", - "│ 0.080959 ┆ 0.026603 ┆ -0.787597 ┆ 0.010794 ┆ … ┆ 1.40992 ┆ 0.962317 ┆ 0 ┆ 1 │\n", - "│ 0.080959 ┆ 0.014737 ┆ -3.106548 ┆ -0.014369 ┆ … ┆ 1.115067 ┆ null ┆ 0 ┆ 1 │\n", - "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + "shape: (42_004, 29)\n", + "┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", + "│ id ┆ dob ┆ lead_crea ┆ city_code ┆ … ┆ loan_amou ┆ loan_amou ┆ employer_ ┆ gender_Ma │\n", + "│ --- ┆ --- ┆ tion_date ┆ --- ┆ ┆ nt_sqrt ┆ nt_lag_1 ┆ category1 ┆ le │\n", + "│ str ┆ str ┆ --- ┆ str ┆ ┆ --- ┆ --- ┆ _is_missi ┆ --- │\n", + "│ ┆ ┆ str ┆ ┆ ┆ f64 ┆ f64 ┆ ng ┆ u8 │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u8 ┆ │\n", + "╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ APPD40611 ┆ 07/12/86 ┆ 04/07/16 ┆ C10003 ┆ … ┆ -0.658025 ┆ 0.181257 ┆ 0 ┆ 1 │\n", + "│ 263344 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPE70289 ┆ 10/12/82 ┆ 19/07/16 ┆ C10125 ┆ … ┆ 0.375948 ┆ 1.710834 ┆ 0 ┆ 1 │\n", + "│ 249423 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPF80273 ┆ 30/01/89 ┆ 09/07/16 ┆ C10477 ┆ … ┆ 1.709278 ┆ 0.343978 ┆ 0 ┆ 1 │\n", + "│ 865537 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPG60994 ┆ 19/04/85 ┆ 20/07/16 ┆ C10002 ┆ … ┆ 0.543738 ┆ 2.947513 ┆ 0 ┆ 1 │\n", + "│ 436641 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPK80327 ┆ 28/03/73 ┆ 02/07/16 ┆ C10022 ┆ … ┆ 2.546276 ┆ -0.306906 ┆ 0 ┆ 1 │\n", + "│ 232033 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ APPS20215 ┆ 04/03/86 ┆ 30/09/16 ┆ C10002 ┆ … ┆ -0.193265 ┆ -0.306906 ┆ 0 ┆ 1 │\n", + "│ 136404 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPT50870 ┆ 03/03/91 ┆ 30/09/16 ┆ C10041 ┆ … ┆ -0.193265 ┆ -0.502171 ┆ 0 ┆ 1 │\n", + "│ 248519 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPW50697 ┆ 01/02/92 ┆ 30/09/16 ┆ C10022 ┆ … ┆ -0.46065 ┆ 1.320303 ┆ 0 ┆ 0 │\n", + "│ 209842 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPY50870 ┆ 27/06/78 ┆ 30/09/16 ┆ C10002 ┆ … ┆ 1.40992 ┆ 0.962317 ┆ 0 ┆ 1 │\n", + "│ 035036 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ APPZ60733 ┆ 31/12/89 ┆ 30/09/16 ┆ C10003 ┆ … ┆ 1.115067 ┆ null ┆ 0 ┆ 1 │\n", + "│ 046119 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────────┴──────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 11, @@ -598,32 +535,42 @@ "text/plain": [ "{'name': 'example',\n", " 'target': 'approved',\n", - " 'feature_names_in_': ['ID',\n", - " 'Gender',\n", - " 'DOB',\n", - " 'Lead_Creation_Date',\n", - " 'City_Code',\n", - " 'City_Category',\n", - " 'Employer_Code',\n", - " 'Employer_Category1',\n", - " 'Employer_Category2',\n", - " 'Monthly_Income',\n", - " 'Customer_Existing_Primary_Bank_Code',\n", - " 'Primary_Bank_Type',\n", - " 'Contacted',\n", - " 'Source',\n", - " 'Source_Category',\n", - " 'Existing_EMI',\n", - " 'Loan_Amount',\n", - " 'Loan_Period',\n", - " 'Interest_Rate',\n", - " 'EMI',\n", - " 'Var1',\n", - " 'Approved'],\n", - " 'feature_names_out_': ['city_category',\n", + " 'feature_names_in_': ['id',\n", + " 'gender',\n", + " 'dob',\n", + " 'lead_creation_date',\n", + " 'city_code',\n", + " 'city_category',\n", + " 'employer_code',\n", + " 'employer_category1',\n", + " 'employer_category2',\n", + " 'monthly_income',\n", + " 'customer_existing_primary_bank_code',\n", + " 'primary_bank_type',\n", + " 'contacted',\n", + " 'source',\n", + " 'source_category',\n", + " 'existing_emi',\n", + " 'loan_amount',\n", + " 'loan_period',\n", + " 'interest_rate',\n", + " 'emi',\n", + " 'var1',\n", + " 'approved'],\n", + " 'feature_names_out_': ['id',\n", + " 'dob',\n", + " 'lead_creation_date',\n", + " 'city_code',\n", + " 'city_category',\n", + " 'employer_code',\n", " 'employer_category1',\n", " 'employer_category2',\n", " 'monthly_income',\n", + " 'customer_existing_primary_bank_code',\n", + " 'primary_bank_type',\n", + " 'contacted',\n", + " 'source',\n", + " 'source_category',\n", " 'existing_emi',\n", " 'loan_amount',\n", " 'loan_period',\n", @@ -639,58 +586,35 @@ " 'loan_amount_lag_1',\n", " 'employer_category1_is_missing',\n", " 'gender_Male'],\n", - " 'transforms': [{'SelectStep': ['{\"Alias\":[{\"Column\":\"ID\"},\"id\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Gender\"},\"gender\"]}',\n", - " '{\"Alias\":[{\"Column\":\"DOB\"},\"dob\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Lead_Creation_Date\"},\"lead_creation_date\"]}',\n", - " '{\"Alias\":[{\"Column\":\"City_Code\"},\"city_code\"]}',\n", - " '{\"Alias\":[{\"Column\":\"City_Category\"},\"city_category\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Employer_Code\"},\"employer_code\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Employer_Category1\"},\"employer_category1\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Employer_Category2\"},\"employer_category2\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Monthly_Income\"},\"monthly_income\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Customer_Existing_Primary_Bank_Code\"},\"customer_existing_primary_bank_code\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Primary_Bank_Type\"},\"primary_bank_type\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Contacted\"},\"contacted\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Source\"},\"source\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Source_Category\"},\"source_category\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Existing_EMI\"},\"existing_emi\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Loan_Amount\"},\"loan_amount\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Loan_Period\"},\"loan_period\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Interest_Rate\"},\"interest_rate\"]}',\n", - " '{\"Alias\":[{\"Column\":\"EMI\"},\"emi\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Var1\"},\"var1\"]}',\n", - " '{\"Alias\":[{\"Column\":\"Approved\"},\"approved\"]}']},\n", - " {'SQLStep': \"\\nselect\\n*\\n, 'TEST' as test_col\\nfrom df\\nwhere loan_amount is not null\\n\"},\n", - " {'FilterStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"}],\"function\":{\"Boolean\":\"IsNotNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", - " {'SelectStep': ['{\"Selector\":{\"Add\":[{\"Root\":{\"DtypeColumn\":[\"Int16\",\"UInt32\",\"UInt8\",\"Float64\",\"Int32\",\"UInt64\",\"Int8\",\"UInt16\",\"Int64\",{\"Decimal\":[null,null]},\"Float32\"]}},{\"Root\":{\"Columns\":[\"gender\",\"employer_category1\",\"city_category\",\"test_col\"]}}]}}']},\n", - " {'WithColumnsStep': ['{\"Alias\":[{\"Ternary\":{\"predicate\":{\"Function\":{\"input\":[{\"Column\":\"loan_period\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"truthy\":{\"Function\":{\"input\":[{\"BinaryExpr\":{\"left\":{\"Column\":\"var1\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":0.5098100117596667}}}},{\"BinaryExpr\":{\"left\":{\"Column\":\"existing_emi\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":-7.6040796537530525e-6}}}}],\"function\":{\"SumHorizontal\":{\"ignore_nulls\":true}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE | INPUT_WILDCARD_EXPANSION\"}}},\"falsy\":{\"Cast\":{\"expr\":{\"Column\":\"loan_period\"},\"dtype\":\"Float64\",\"options\":\"Strict\"}}}},\"loan_period\"]}']},\n", - " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"existing_emi\"},{\"Literal\":{\"Float\":0.0}}],\"function\":\"FillNull\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", - " {'WithColumnsStep': ['{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"existing_emi\"}],\"function\":\"Log1p\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"existing_emi_log1p\"]}',\n", - " '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"}],\"function\":\"Log1p\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_log1p\"]}',\n", - " '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"},{\"Literal\":{\"Int\":0}},{\"Literal\":{\"Int\":1000}}],\"function\":{\"Clip\":{\"has_min\":true,\"has_max\":true}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_log1p_clipped\"]}',\n", - " '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"}],\"function\":{\"Pow\":\"Sqrt\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_sqrt\"]}',\n", - " '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"},{\"Literal\":{\"Int\":-1}}],\"function\":\"Shift\",\"options\":{\"collect_groups\":\"GroupWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_lag_1\"]}']},\n", - " {'WithColumnsStep': ['{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"employer_category2\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":3.679269695227142}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":0.8624587860675418}}}}',\n", - " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"monthly_income\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":7463.797309780022}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":225051.544361042}}}}',\n", - " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"existing_emi\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":265.69024752404533}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":2757.6092535058197}}}}',\n", - " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":39429.98285877536}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":30727.230218132237}}}}',\n", + " 'transforms': [{'SQLStep': \"\\nselect\\n*\\n, 'TEST' as test_col\\nfrom df\\nwhere loan_amount is not null\\n\"},\n", + " {'FilterStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"}],\"function\":{\"Boolean\":\"IsNotNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", + " {'WithColumnsStep': ['{\"Alias\":[{\"Ternary\":{\"predicate\":{\"Function\":{\"input\":[{\"Column\":\"loan_period\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"truthy\":{\"Function\":{\"input\":[{\"BinaryExpr\":{\"left\":{\"Column\":\"var1\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":0.5098100117596667}}}},{\"BinaryExpr\":{\"left\":{\"Column\":\"existing_emi\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":-7.6040796537530525e-6}}}}],\"function\":{\"SumHorizontal\":{\"ignore_nulls\":true}},\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE | INPUT_WILDCARD_EXPANSION\"}}},\"falsy\":{\"Cast\":{\"expr\":{\"Column\":\"loan_period\"},\"dtype\":\"Float64\",\"options\":\"Strict\"}}}},\"loan_period\"]}']},\n", + " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"existing_emi\"},{\"Literal\":{\"Float\":0.0}}],\"function\":\"FillNull\",\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", + " {'WithColumnsStep': ['{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"existing_emi\"}],\"function\":\"Log1p\",\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"existing_emi_log1p\"]}',\n", + " '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"}],\"function\":\"Log1p\",\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_log1p\"]}',\n", + " '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"},{\"Literal\":{\"Int\":0}},{\"Literal\":{\"Int\":1000}}],\"function\":{\"Clip\":{\"has_min\":true,\"has_max\":true}},\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_log1p_clipped\"]}',\n", + " '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"}],\"function\":{\"Pow\":\"Sqrt\"},\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_sqrt\"]}',\n", + " '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"},{\"Literal\":{\"Int\":-1}}],\"function\":\"Shift\",\"options\":{\"collect_groups\":\"GroupWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_lag_1\"]}']},\n", + " {'WithColumnsStep': ['{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"employer_category2\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":3.679269695227142}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":0.862458786067542}}}}',\n", + " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"monthly_income\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":7463.797309780022}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":225051.54436104206}}}}',\n", + " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"existing_emi\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":265.6902475240454}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":2757.6092535058183}}}}',\n", + " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":39429.98285877536}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":30727.23021813224}}}}',\n", " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_period\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":3.8906294638605847}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":1.1674769269411276}}}}',\n", - " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"interest_rate\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":19.21356976117795}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":5.8470045315482775}}}}',\n", - " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"emi\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":1101.4662416950978}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":752.644499525087}}}}',\n", - " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount_log1p\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":10.326664500589034}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":0.7219306388560957}}}}',\n", + " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"interest_rate\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":19.213569761177947}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":5.8470045315482775}}}}',\n", + " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"emi\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":1101.4662416950978}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":752.6444995250869}}}}',\n", + " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount_log1p\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":10.326664500589036}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":0.7219306388560954}}}}',\n", " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount_log1p_clipped\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":1000.0}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":0.0}}}}',\n", - " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount_sqrt\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":186.42196063934287}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":68.38739247959863}}}}',\n", - " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount_lag_1\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":39430.44544437302}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":30727.449732966954}}}}']},\n", - " {'WithColumnsStep': ['{\"Alias\":[{\"Cast\":{\"expr\":{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"dtype\":\"UInt8\",\"options\":\"Strict\"}},\"employer_category1_is_missing\"]}']},\n", + " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount_sqrt\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":186.4219606393428}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":68.38739247959866}}}}',\n", + " '{\"BinaryExpr\":{\"left\":{\"BinaryExpr\":{\"left\":{\"Column\":\"loan_amount_lag_1\"},\"op\":\"Minus\",\"right\":{\"Literal\":{\"Float\":39430.44544437302}}}},\"op\":\"TrueDivide\",\"right\":{\"Literal\":{\"Float\":30727.449732966947}}}}']},\n", + " {'WithColumnsStep': ['{\"Alias\":[{\"Cast\":{\"expr\":{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"dtype\":\"UInt8\",\"options\":\"Strict\"}},\"employer_category1_is_missing\"]}']},\n", " {'WithColumnsStep': ['{\"Alias\":[{\"Cast\":{\"expr\":{\"BinaryExpr\":{\"left\":{\"Column\":\"gender\"},\"op\":\"EqValidity\",\"right\":{\"Literal\":{\"String\":\"Male\"}}}},\"dtype\":\"UInt8\",\"options\":\"Strict\"}},\"gender_Male\"]}']},\n", " {'SelectStep': ['{\"Exclude\":[\"Wildcard\",[{\"Name\":\"gender\"}]]}']},\n", - " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"A\",\"C\",\"B\"]}}},{\"Literal\":{\"Series\":{\"name\":\"woe\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[0.0809586180645928,-0.47955283435510176,-0.04647519483535344]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", - " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"B\",\"C\",\"A\"]}}},{\"Literal\":{\"Series\":{\"name\":\"to\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[0.024335548172757474,0.02660307366189719,0.014736842105263158]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", - " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"DtypeColumn\":[\"Int16\",\"UInt32\",\"UInt8\",\"Int32\",\"UInt64\",\"Int8\",\"UInt16\",\"Int64\"]}],\"function\":\"ShrinkType\",\"options\":{\"collect_groups\":\"GroupWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", - " {'WithColumnsStep': ['{\"Cast\":{\"expr\":{\"DtypeColumn\":[\"Float64\",\"Float32\"]},\"dtype\":\"Float32\",\"options\":\"Strict\"}}']}],\n", + " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"},{\"Literal\":{\"Series\":[255,255,255,255,176,0,0,0,4,0,0,0,242,255,255,255,20,0,0,0,4,0,1,0,0,0,10,0,11,0,8,0,10,0,4,0,242,255,255,255,72,0,0,0,16,0,0,0,0,0,10,0,12,0,0,0,4,0,8,0,1,0,0,0,4,0,0,0,244,255,255,255,24,0,0,0,12,0,0,0,8,0,12,0,4,0,8,0,1,0,0,0,48,0,0,0,9,0,0,0,95,80,76,95,70,76,65,71,83,0,0,0,1,0,0,0,4,0,0,0,236,255,255,255,44,0,0,0,32,0,0,0,24,0,0,0,1,24,0,0,16,0,18,0,4,0,16,0,17,0,8,0,0,0,12,0,0,0,0,0,252,255,255,255,4,0,4,0,5,0,0,0,118,97,108,117,101,0,0,0,0,0,0,0,255,255,255,255,168,0,0,0,4,0,0,0,236,255,255,255,64,0,0,0,0,0,0,0,20,0,0,0,4,0,3,0,12,0,19,0,16,0,18,0,12,0,4,0,226,255,255,255,3,0,0,0,0,0,0,0,96,0,0,0,52,0,0,0,36,0,0,0,20,0,0,0,0,0,14,0,28,0,4,0,12,0,16,0,20,0,24,0,1,0,0,0,0,0,0,0,0,0,0,0,250,255,255,255,1,0,6,0,5,0,4,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0,0,0,0,0,40,181,47,253,0,88,205,0,0,144,1,0,0,0,66,0,1,0,0,0,65,0,1,0,0,0,67,0,3,84,6,0,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]}},{\"Literal\":{\"Series\":[255,255,255,255,176,0,0,0,4,0,0,0,242,255,255,255,20,0,0,0,4,0,1,0,0,0,10,0,11,0,8,0,10,0,4,0,242,255,255,255,72,0,0,0,16,0,0,0,0,0,10,0,12,0,0,0,4,0,8,0,1,0,0,0,4,0,0,0,244,255,255,255,24,0,0,0,12,0,0,0,8,0,12,0,4,0,8,0,1,0,0,0,48,0,0,0,9,0,0,0,95,80,76,95,70,76,65,71,83,0,0,0,1,0,0,0,4,0,0,0,236,255,255,255,48,0,0,0,32,0,0,0,24,0,0,0,1,3,0,0,16,0,18,0,4,0,16,0,17,0,8,0,0,0,12,0,0,0,0,0,250,255,255,255,2,0,6,0,6,0,4,0,3,0,0,0,119,111,101,0,0,0,0,0,255,255,255,255,152,0,0,0,4,0,0,0,236,255,255,255,64,0,0,0,0,0,0,0,20,0,0,0,4,0,3,0,12,0,19,0,16,0,18,0,12,0,4,0,228,255,255,255,3,0,0,0,0,0,0,0,80,0,0,0,36,0,0,0,20,0,0,0,0,0,0,0,12,0,24,0,4,0,12,0,16,0,20,0,250,255,255,255,1,0,6,0,5,0,4,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,40,181,47,253,0,88,193,0,0,158,64,201,195,152,203,167,191,39,178,234,56,180,185,180,63,214,151,16,95,254,176,222,191,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", + " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"},{\"Literal\":{\"Series\":[255,255,255,255,176,0,0,0,4,0,0,0,242,255,255,255,20,0,0,0,4,0,1,0,0,0,10,0,11,0,8,0,10,0,4,0,242,255,255,255,72,0,0,0,16,0,0,0,0,0,10,0,12,0,0,0,4,0,8,0,1,0,0,0,4,0,0,0,244,255,255,255,24,0,0,0,12,0,0,0,8,0,12,0,4,0,8,0,1,0,0,0,48,0,0,0,9,0,0,0,95,80,76,95,70,76,65,71,83,0,0,0,1,0,0,0,4,0,0,0,236,255,255,255,44,0,0,0,32,0,0,0,24,0,0,0,1,24,0,0,16,0,18,0,4,0,16,0,17,0,8,0,0,0,12,0,0,0,0,0,252,255,255,255,4,0,4,0,5,0,0,0,118,97,108,117,101,0,0,0,0,0,0,0,255,255,255,255,168,0,0,0,4,0,0,0,236,255,255,255,64,0,0,0,0,0,0,0,20,0,0,0,4,0,3,0,12,0,19,0,16,0,18,0,12,0,4,0,226,255,255,255,3,0,0,0,0,0,0,0,96,0,0,0,52,0,0,0,36,0,0,0,20,0,0,0,0,0,14,0,28,0,4,0,12,0,16,0,20,0,24,0,1,0,0,0,0,0,0,0,0,0,0,0,250,255,255,255,1,0,6,0,5,0,4,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0,0,0,0,0,40,181,47,253,0,88,205,0,0,144,1,0,0,0,65,0,1,0,0,0,66,0,1,0,0,0,67,0,3,84,6,0,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]}},{\"Literal\":{\"Series\":[255,255,255,255,176,0,0,0,4,0,0,0,242,255,255,255,20,0,0,0,4,0,1,0,0,0,10,0,11,0,8,0,10,0,4,0,242,255,255,255,72,0,0,0,16,0,0,0,0,0,10,0,12,0,0,0,4,0,8,0,1,0,0,0,4,0,0,0,244,255,255,255,24,0,0,0,12,0,0,0,8,0,12,0,4,0,8,0,1,0,0,0,48,0,0,0,9,0,0,0,95,80,76,95,70,76,65,71,83,0,0,0,1,0,0,0,4,0,0,0,236,255,255,255,48,0,0,0,32,0,0,0,24,0,0,0,1,3,0,0,16,0,18,0,4,0,16,0,17,0,8,0,0,0,12,0,0,0,0,0,250,255,255,255,2,0,6,0,6,0,4,0,2,0,0,0,116,111,0,0,0,0,0,0,255,255,255,255,152,0,0,0,4,0,0,0,236,255,255,255,64,0,0,0,0,0,0,0,20,0,0,0,4,0,3,0,12,0,19,0,16,0,18,0,12,0,4,0,228,255,255,255,3,0,0,0,0,0,0,0,80,0,0,0,36,0,0,0,20,0,0,0,0,0,0,0,12,0,24,0,4,0,12,0,16,0,20,0,250,255,255,255,1,0,6,0,5,0,4,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,40,181,47,253,0,88,193,0,0,127,124,27,119,89,46,142,63,212,255,32,254,106,235,152,63,131,89,103,13,214,61,155,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']}],\n", " 'ensure_features_in': False,\n", - " 'ensure_features_out': True}" + " 'ensure_features_out': True,\n", + " 'lowercase': True,\n", + " 'uppercase': False}" ] }, "execution_count": 13, @@ -805,30 +729,6 @@ "Naive Query Steps: \n", "\n", "Step 1:\n", - "col(\"ID\").alias(\"id\"),\n", - "col(\"Gender\").alias(\"gender\"),\n", - "col(\"DOB\").alias(\"dob\"),\n", - "col(\"Lead_Creation_Date\").alias(\"lead_creation_date\"),\n", - "col(\"City_Code\").alias(\"city_code\"),\n", - "col(\"City_Category\").alias(\"city_category\"),\n", - "col(\"Employer_Code\").alias(\"employer_code\"),\n", - "col(\"Employer_Category1\").alias(\"employer_category1\"),\n", - "col(\"Employer_Category2\").alias(\"employer_category2\"),\n", - "col(\"Monthly_Income\").alias(\"monthly_income\"),\n", - "col(\"Customer_Existing_Primary_Bank_Code\").alias(\"customer_existing_primary_bank_code\"),\n", - "col(\"Primary_Bank_Type\").alias(\"primary_bank_type\"),\n", - "col(\"Contacted\").alias(\"contacted\"),\n", - "col(\"Source\").alias(\"source\"),\n", - "col(\"Source_Category\").alias(\"source_category\"),\n", - "col(\"Existing_EMI\").alias(\"existing_emi\"),\n", - "col(\"Loan_Amount\").alias(\"loan_amount\"),\n", - "col(\"Loan_Period\").alias(\"loan_period\"),\n", - "col(\"Interest_Rate\").alias(\"interest_rate\"),\n", - "col(\"EMI\").alias(\"emi\"),\n", - "col(\"Var1\").alias(\"var1\"),\n", - "col(\"Approved\").alias(\"approved\")\n", - "\n", - "Step 2:\n", "col(\"var1\").fill_null([dyn float: 0.5]),\n", "col(\"existing_emi\").fill_null([dyn float: 0.5]),\n", "col(\"loan_amount\").fill_null([dyn float: 5000.5])\n" @@ -841,8 +741,7 @@ ], "source": [ "bp = (\n", - " Blueprint(df, name = \"example\", target = \"approved\")\n", - " .lowercase() # lowercase all columns\n", + " Blueprint(df, name = \"example\", target = \"approved\", lowercase=True)\n", " .append_fit_func(smallest_abs_impute, [\"var1\", \"existing_emi\", \"loan_amount\"], epsilon = 0.5)\n", " # Use append_fit_func for custom transforms\n", ")\n", @@ -1029,8 +928,9 @@ "metadata": {}, "outputs": [], "source": [ - "from polars_ds.pipeline import Blueprint, FitStep\n", - "from typing import Self, Union, List\n", + "from polars_ds.modeling import Blueprint, FitStep\n", + "from typing import Union, List\n", + "from typing_extensions import Self\n", "from functools import partial\n", "\n", "def smallest_abs_impute(df:Union[pl.DataFrame, pl.LazyFrame], cols: List[str], epsilon:float = 0.01) -> List[pl.Expr]:\n", @@ -1200,7 +1100,7 @@ } ], "source": [ - "import polars_ds.pipeline as pds_pipe\n", + "import polars_ds.modeling as pm\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", @@ -1212,8 +1112,7 @@ " def fit(self, df, y=None):\n", " # specify all the rules for the transform here\n", " bp = (\n", - " pds_pipe.Blueprint(df, name = \"example\", target = \"approved\") \n", - " .lowercase() \n", + " pm.Blueprint(df, name = \"example\", target = \"approved\", lowercase=True) \n", " .filter( \n", " \"city_category is not null\" # or equivalently, you can do: pl.col(\"city_category\").is_not_null()\n", " )\n", @@ -1264,7 +1163,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.13.1" } }, "nbformat": 4, diff --git a/examples/sample_and_split.ipynb b/examples/sample_and_split.ipynb index 8eb12e1c..1376af17 100644 --- a/examples/sample_and_split.ipynb +++ b/examples/sample_and_split.ipynb @@ -24,7 +24,7 @@ "source": [ "import polars as pl\n", "import polars_ds as pds\n", - "import polars_ds.sample_and_split as sa" + "import polars_ds.sample_and_split as ss" ] }, { @@ -42,21 +42,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
05.3554620.2275850.8754131.255306-1534.2960750"A"
13.1437420.6517112.12331-0.27767544.7987710"A"
29.5851380.7201471.048850.019822388.7244410"A"
311.730430.0596023.624234-1.177224442.3975180"A"
41.3104150.7838363.703261.501242189.0644922"A"
" + "shape: (5, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
06.1231570.013650.4850730.524003-703.1781040"A"
12.7756940.6902482.850941-1.966617-143.7885060"A"
22.0496790.5575662.132329-0.160845-71.0748231"A"
311.2618760.5618434.7373461.611832-300.3124192"A"
47.9551850.2238381.9777460.272536-398.717871"A"
" ], "text/plain": [ "shape: (5, 8)\n", - "┌─────────┬───────────┬───────────┬──────────┬───────────┬──────────────┬───────┬──────────┐\n", - "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", - "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", - "│ 0 ┆ 5.355462 ┆ 0.227585 ┆ 0.875413 ┆ 1.255306 ┆ -1534.296075 ┆ 0 ┆ A │\n", - "│ 1 ┆ 3.143742 ┆ 0.651711 ┆ 2.12331 ┆ -0.27767 ┆ 544.798771 ┆ 0 ┆ A │\n", - "│ 2 ┆ 9.585138 ┆ 0.720147 ┆ 1.04885 ┆ 0.01982 ┆ 2388.724441 ┆ 0 ┆ A │\n", - "│ 3 ┆ 11.73043 ┆ 0.059602 ┆ 3.624234 ┆ -1.177224 ┆ 442.397518 ┆ 0 ┆ A │\n", - "│ 4 ┆ 1.310415 ┆ 0.783836 ┆ 3.70326 ┆ 1.501242 ┆ 189.064492 ┆ 2 ┆ A │\n", - "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" + "┌─────────┬───────────┬───────────┬──────────┬───────────┬─────────────┬───────┬──────────┐\n", + "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", + "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪═════════════╪═══════╪══════════╡\n", + "│ 0 ┆ 6.123157 ┆ 0.01365 ┆ 0.485073 ┆ 0.524003 ┆ -703.178104 ┆ 0 ┆ A │\n", + "│ 1 ┆ 2.775694 ┆ 0.690248 ┆ 2.850941 ┆ -1.966617 ┆ -143.788506 ┆ 0 ┆ A │\n", + "│ 2 ┆ 2.049679 ┆ 0.557566 ┆ 2.132329 ┆ -0.160845 ┆ -71.074823 ┆ 1 ┆ A │\n", + "│ 3 ┆ 11.261876 ┆ 0.561843 ┆ 4.737346 ┆ 1.611832 ┆ -300.312419 ┆ 2 ┆ A │\n", + "│ 4 ┆ 7.955185 ┆ 0.223838 ┆ 1.977746 ┆ 0.272536 ┆ -398.71787 ┆ 1 ┆ A │\n", + "└─────────┴───────────┴───────────┴──────────┴───────────┴─────────────┴───────┴──────────┘" ] }, "execution_count": 2, @@ -94,7 +94,7 @@ } ], "source": [ - "sa.random_cols(df.columns, 2, keep = [\"row_num\"])" + "ss.random_cols(df.columns, 2, keep = [\"row_num\"])" ] }, { @@ -112,7 +112,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (60_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
13.1437420.6517112.12331-0.27767544.7987710"A"
29.5851380.7201471.048850.019822388.7244410"A"
60.1896620.06511.316939-0.244435748.9951790"A"
70.6613460.8740924.8430380.31243-383.6591350"A"
80.0538010.9833420.4523620.312257-386.6897190"A"
999947.5361220.114142.847801-0.916853-1340.1115132"C"
9999610.0305770.9395680.9877190.701578-768.0626550"C"
999975.1185980.5523952.390273-2.57956-1076.6100990"C"
999985.7014280.5215721.290974-1.3617795.2780611"C"
999997.9460390.2251552.5649990.367505-1021.4799371"C"
" + "shape: (60_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
12.7756940.6902482.850941-1.966617-143.7885060"A"
22.0496790.5575662.132329-0.160845-71.0748231"A"
311.2618760.5618434.7373461.611832-300.3124192"A"
47.9551850.2238381.9777460.272536-398.717871"A"
63.5760430.1483580.240111-1.077243-1401.0359220"A"
999901.2106440.2454342.107333-0.943427-105.6826450"C"
999947.4091020.8318711.3370310.203354-616.6256190"C"
999952.3741910.9947810.468766-1.096312579.2030780"C"
999969.6011290.7182861.2477331.5185551059.5358280"C"
999974.2474730.0576530.5545831.383503-925.2460720"C"
" ], "text/plain": [ "shape: (60_000, 8)\n", @@ -121,17 +121,17 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", - "│ 1 ┆ 3.143742 ┆ 0.651711 ┆ 2.12331 ┆ -0.27767 ┆ 544.798771 ┆ 0 ┆ A │\n", - "│ 2 ┆ 9.585138 ┆ 0.720147 ┆ 1.04885 ┆ 0.01982 ┆ 2388.724441 ┆ 0 ┆ A │\n", - "│ 6 ┆ 0.189662 ┆ 0.0651 ┆ 1.316939 ┆ -0.244435 ┆ 748.995179 ┆ 0 ┆ A │\n", - "│ 7 ┆ 0.661346 ┆ 0.874092 ┆ 4.843038 ┆ 0.31243 ┆ -383.659135 ┆ 0 ┆ A │\n", - "│ 8 ┆ 0.053801 ┆ 0.983342 ┆ 0.452362 ┆ 0.312257 ┆ -386.689719 ┆ 0 ┆ A │\n", + "│ 1 ┆ 2.775694 ┆ 0.690248 ┆ 2.850941 ┆ -1.966617 ┆ -143.788506 ┆ 0 ┆ A │\n", + "│ 2 ┆ 2.049679 ┆ 0.557566 ┆ 2.132329 ┆ -0.160845 ┆ -71.074823 ┆ 1 ┆ A │\n", + "│ 3 ┆ 11.261876 ┆ 0.561843 ┆ 4.737346 ┆ 1.611832 ┆ -300.312419 ┆ 2 ┆ A │\n", + "│ 4 ┆ 7.955185 ┆ 0.223838 ┆ 1.977746 ┆ 0.272536 ┆ -398.71787 ┆ 1 ┆ A │\n", + "│ 6 ┆ 3.576043 ┆ 0.148358 ┆ 0.240111 ┆ -1.077243 ┆ -1401.035922 ┆ 0 ┆ A │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 99994 ┆ 7.536122 ┆ 0.11414 ┆ 2.847801 ┆ -0.916853 ┆ -1340.111513 ┆ 2 ┆ C │\n", - "│ 99996 ┆ 10.030577 ┆ 0.939568 ┆ 0.987719 ┆ 0.701578 ┆ -768.062655 ┆ 0 ┆ C │\n", - "│ 99997 ┆ 5.118598 ┆ 0.552395 ┆ 2.390273 ┆ -2.57956 ┆ -1076.610099 ┆ 0 ┆ C │\n", - "│ 99998 ┆ 5.701428 ┆ 0.521572 ┆ 1.290974 ┆ -1.361779 ┆ 5.278061 ┆ 1 ┆ C │\n", - "│ 99999 ┆ 7.946039 ┆ 0.225155 ┆ 2.564999 ┆ 0.367505 ┆ -1021.479937 ┆ 1 ┆ C │\n", + "│ 99990 ┆ 1.210644 ┆ 0.245434 ┆ 2.107333 ┆ -0.943427 ┆ -105.682645 ┆ 0 ┆ C │\n", + "│ 99994 ┆ 7.409102 ┆ 0.831871 ┆ 1.337031 ┆ 0.203354 ┆ -616.625619 ┆ 0 ┆ C │\n", + "│ 99995 ┆ 2.374191 ┆ 0.994781 ┆ 0.468766 ┆ -1.096312 ┆ 579.203078 ┆ 0 ┆ C │\n", + "│ 99996 ┆ 9.601129 ┆ 0.718286 ┆ 1.247733 ┆ 1.518555 ┆ 1059.535828 ┆ 0 ┆ C │\n", + "│ 99997 ┆ 4.247473 ┆ 0.057653 ┆ 0.554583 ┆ 1.383503 ┆ -925.246072 ┆ 0 ┆ C │\n", "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" ] }, @@ -142,7 +142,7 @@ ], "source": [ "# Random Sample\n", - "sa.sample(df, 0.6) # by ratio" + "ss.sample(df, 0.6) # by ratio" ] }, { @@ -160,7 +160,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (30_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
109.7816230.5638684.4885530.1231011628.8184961"A"
114.5083280.5946973.8777570.849688-1242.376971"A"
141.7023380.7763051.3469870.481826-403.302142"A"
1911.8972340.550351.7914770.861923641.5327762"A"
224.0775150.7377171.0932351.0484441269.1830712"A"
999895.260120.4790690.748342-0.224175-84.2662241"C"
999947.5361220.114142.847801-0.916853-1340.1115132"C"
9999510.4906820.6116920.384882-0.474915157.0110962"C"
9999610.0305770.9395680.9877190.701578-768.0626550"C"
999985.7014280.5215721.290974-1.3617795.2780611"C"
" + "shape: (30_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
12.7756940.6902482.850941-1.966617-143.7885060"A"
22.0496790.5575662.132329-0.160845-71.0748231"A"
51.69690.0332051.5380251.208673-445.6912160"A"
78.9973350.8625420.4427381.1435121912.5424721"A"
96.1044260.9530884.059342-1.483365-1665.7200520"A"
999901.2106440.2454342.107333-0.943427-105.6826450"C"
999952.3741910.9947810.468766-1.096312579.2030780"C"
999974.2474730.0576530.5545831.383503-925.2460720"C"
999981.480790.9935683.6799751.2700191459.7755581"C"
999995.1029520.2684111.892591.530195225.9418442"C"
" ], "text/plain": [ "shape: (30_000, 8)\n", @@ -169,17 +169,17 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", - "│ 10 ┆ 9.781623 ┆ 0.563868 ┆ 4.488553 ┆ 0.123101 ┆ 1628.818496 ┆ 1 ┆ A │\n", - "│ 11 ┆ 4.508328 ┆ 0.594697 ┆ 3.877757 ┆ 0.849688 ┆ -1242.37697 ┆ 1 ┆ A │\n", - "│ 14 ┆ 1.702338 ┆ 0.776305 ┆ 1.346987 ┆ 0.481826 ┆ -403.30214 ┆ 2 ┆ A │\n", - "│ 19 ┆ 11.897234 ┆ 0.55035 ┆ 1.791477 ┆ 0.861923 ┆ 641.532776 ┆ 2 ┆ A │\n", - "│ 22 ┆ 4.077515 ┆ 0.737717 ┆ 1.093235 ┆ 1.048444 ┆ 1269.183071 ┆ 2 ┆ A │\n", + "│ 1 ┆ 2.775694 ┆ 0.690248 ┆ 2.850941 ┆ -1.966617 ┆ -143.788506 ┆ 0 ┆ A │\n", + "│ 2 ┆ 2.049679 ┆ 0.557566 ┆ 2.132329 ┆ -0.160845 ┆ -71.074823 ┆ 1 ┆ A │\n", + "│ 5 ┆ 1.6969 ┆ 0.033205 ┆ 1.538025 ┆ 1.208673 ┆ -445.691216 ┆ 0 ┆ A │\n", + "│ 7 ┆ 8.997335 ┆ 0.862542 ┆ 0.442738 ┆ 1.143512 ┆ 1912.542472 ┆ 1 ┆ A │\n", + "│ 9 ┆ 6.104426 ┆ 0.953088 ┆ 4.059342 ┆ -1.483365 ┆ -1665.720052 ┆ 0 ┆ A │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 99989 ┆ 5.26012 ┆ 0.479069 ┆ 0.748342 ┆ -0.224175 ┆ -84.266224 ┆ 1 ┆ C │\n", - "│ 99994 ┆ 7.536122 ┆ 0.11414 ┆ 2.847801 ┆ -0.916853 ┆ -1340.111513 ┆ 2 ┆ C │\n", - "│ 99995 ┆ 10.490682 ┆ 0.611692 ┆ 0.384882 ┆ -0.474915 ┆ 157.011096 ┆ 2 ┆ C │\n", - "│ 99996 ┆ 10.030577 ┆ 0.939568 ┆ 0.987719 ┆ 0.701578 ┆ -768.062655 ┆ 0 ┆ C │\n", - "│ 99998 ┆ 5.701428 ┆ 0.521572 ┆ 1.290974 ┆ -1.361779 ┆ 5.278061 ┆ 1 ┆ C │\n", + "│ 99990 ┆ 1.210644 ┆ 0.245434 ┆ 2.107333 ┆ -0.943427 ┆ -105.682645 ┆ 0 ┆ C │\n", + "│ 99995 ┆ 2.374191 ┆ 0.994781 ┆ 0.468766 ┆ -1.096312 ┆ 579.203078 ┆ 0 ┆ C │\n", + "│ 99997 ┆ 4.247473 ┆ 0.057653 ┆ 0.554583 ┆ 1.383503 ┆ -925.246072 ┆ 0 ┆ C │\n", + "│ 99998 ┆ 1.48079 ┆ 0.993568 ┆ 3.679975 ┆ 1.270019 ┆ 1459.775558 ┆ 1 ┆ C │\n", + "│ 99999 ┆ 5.102952 ┆ 0.268411 ┆ 1.89259 ┆ 1.530195 ┆ 225.941844 ┆ 2 ┆ C │\n", "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" ] }, @@ -189,7 +189,7 @@ } ], "source": [ - "sa.sample(df, 30_000) # by count" + "ss.sample(df, 30_000) # by count" ] }, { @@ -207,7 +207,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
033381
133169
233450
" + "shape: (3, 2)
flagslen
i32u32
033569
133084
233347
" ], "text/plain": [ "shape: (3, 2)\n", @@ -216,9 +216,9 @@ "│ --- ┆ --- │\n", "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", - "│ 0 ┆ 33381 │\n", - "│ 1 ┆ 33169 │\n", - "│ 2 ┆ 33450 │\n", + "│ 0 ┆ 33569 │\n", + "│ 1 ┆ 33084 │\n", + "│ 2 ┆ 33347 │\n", "└───────┴───────┘" ] }, @@ -246,7 +246,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
016690
133169
233450
" + "shape: (3, 2)
flagslen
i32u32
016784
133084
233347
" ], "text/plain": [ "shape: (3, 2)\n", @@ -255,9 +255,9 @@ "│ --- ┆ --- │\n", "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", - "│ 0 ┆ 16690 │\n", - "│ 1 ┆ 33169 │\n", - "│ 2 ┆ 33450 │\n", + "│ 0 ┆ 16784 │\n", + "│ 1 ┆ 33084 │\n", + "│ 2 ┆ 33347 │\n", "└───────┴───────┘" ] }, @@ -268,7 +268,7 @@ ], "source": [ "# Downsample on one group\n", - "sa1 = sa.downsample(\n", + "sa1 = ss.downsample(\n", " df, \n", " (pl.col(\"flags\") == 0, 0.5)\n", ")\n", @@ -290,7 +290,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
016690
19950
213380
" + "shape: (3, 2)
flagslen
i32u32
016784
19925
213338
" ], "text/plain": [ "shape: (3, 2)\n", @@ -299,9 +299,9 @@ "│ --- ┆ --- │\n", "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", - "│ 0 ┆ 16690 │\n", - "│ 1 ┆ 9950 │\n", - "│ 2 ┆ 13380 │\n", + "│ 0 ┆ 16784 │\n", + "│ 1 ┆ 9925 │\n", + "│ 2 ┆ 13338 │\n", "└───────┴───────┘" ] }, @@ -312,7 +312,7 @@ ], "source": [ "# Downsample on multiple groups\n", - "sa2 = sa.downsample(\n", + "sa2 = ss.downsample(\n", " df, \n", " (pl.col(\"flags\") == 0, 0.5),\n", " (pl.col(\"flags\") == 1, 0.3),\n", @@ -397,7 +397,7 @@ ], "source": [ "# Volume neutral by each category, will take the greatest possible value so that we get neutral volume.\n", - "vn = sa.volume_neutral(\n", + "vn = ss.volume_neutral(\n", " df,\n", " by = pl.col(\"category\"),\n", ")\n", @@ -441,7 +441,7 @@ ], "source": [ "# Volume neutral (10_000) by each category\n", - "vn = sa.volume_neutral(\n", + "vn = ss.volume_neutral(\n", " df,\n", " by = pl.col(\"category\"),\n", " target_volume = 10_000\n", @@ -464,7 +464,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
categorylen
stru32
"A"10000
"B"4220
"C"5780
" + "shape: (3, 2)
categorylen
stru32
"A"10000
"B"4219
"C"5781
" ], "text/plain": [ "shape: (3, 2)\n", @@ -474,8 +474,8 @@ "│ str ┆ u32 │\n", "╞══════════╪═══════╡\n", "│ A ┆ 10000 │\n", - "│ B ┆ 4220 │\n", - "│ C ┆ 5780 │\n", + "│ B ┆ 4219 │\n", + "│ C ┆ 5781 │\n", "└──────────┴───────┘" ] }, @@ -486,7 +486,7 @@ ], "source": [ "# Volume neutral (10_000) by a more complicated condition\n", - "vn = sa.volume_neutral(\n", + "vn = ss.volume_neutral(\n", " df,\n", " by = pl.col(\"category\") == \"A\",\n", " target_volume = 10_000\n", @@ -509,7 +509,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 3)
categoryflagslen
stri32u32
"A"09917
"A"19917
"A"29917
"B"09848
"B"19848
"B"29848
"C"013262
"C"113262
"C"213262
" + "shape: (9, 3)
categoryflagslen
stri32u32
"A"09954
"A"19954
"A"29954
"B"09910
"B"19910
"B"29910
"C"013215
"C"113215
"C"213215
" ], "text/plain": [ "shape: (9, 3)\n", @@ -518,15 +518,15 @@ "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i32 ┆ u32 │\n", "╞══════════╪═══════╪═══════╡\n", - "│ A ┆ 0 ┆ 9917 │\n", - "│ A ┆ 1 ┆ 9917 │\n", - "│ A ┆ 2 ┆ 9917 │\n", - "│ B ┆ 0 ┆ 9848 │\n", - "│ B ┆ 1 ┆ 9848 │\n", - "│ B ┆ 2 ┆ 9848 │\n", - "│ C ┆ 0 ┆ 13262 │\n", - "│ C ┆ 1 ┆ 13262 │\n", - "│ C ┆ 2 ┆ 13262 │\n", + "│ A ┆ 0 ┆ 9954 │\n", + "│ A ┆ 1 ┆ 9954 │\n", + "│ A ┆ 2 ┆ 9954 │\n", + "│ B ┆ 0 ┆ 9910 │\n", + "│ B ┆ 1 ┆ 9910 │\n", + "│ B ┆ 2 ┆ 9910 │\n", + "│ C ┆ 0 ┆ 13215 │\n", + "│ C ┆ 1 ┆ 13215 │\n", + "│ C ┆ 2 ┆ 13215 │\n", "└──────────┴───────┴───────┘" ] }, @@ -538,7 +538,7 @@ "source": [ "# Volume neutral sample with a control level. Volume neutral happens under the category level, meaning\n", "# the volume for each flag in each category is neutral.\n", - "vn = sa.volume_neutral(\n", + "vn = ss.volume_neutral(\n", " df,\n", " by = pl.col(\"flags\"),\n", " control = pl.col(\"category\")\n", @@ -561,7 +561,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 3)
categoryflagslen
stri32u32
"A"09917
"A"19917
"A"29917
"B"09848
"B"19848
"B"29848
"C"010000
"C"110000
"C"210000
" + "shape: (9, 3)
categoryflagslen
stri32u32
"A"09954
"A"19954
"A"29954
"B"09910
"B"19910
"B"29910
"C"010000
"C"110000
"C"210000
" ], "text/plain": [ "shape: (9, 3)\n", @@ -570,12 +570,12 @@ "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i32 ┆ u32 │\n", "╞══════════╪═══════╪═══════╡\n", - "│ A ┆ 0 ┆ 9917 │\n", - "│ A ┆ 1 ┆ 9917 │\n", - "│ A ┆ 2 ┆ 9917 │\n", - "│ B ┆ 0 ┆ 9848 │\n", - "│ B ┆ 1 ┆ 9848 │\n", - "│ B ┆ 2 ┆ 9848 │\n", + "│ A ┆ 0 ┆ 9954 │\n", + "│ A ┆ 1 ┆ 9954 │\n", + "│ A ┆ 2 ┆ 9954 │\n", + "│ B ┆ 0 ┆ 9910 │\n", + "│ B ┆ 1 ┆ 9910 │\n", + "│ B ┆ 2 ┆ 9910 │\n", "│ C ┆ 0 ┆ 10000 │\n", "│ C ┆ 1 ┆ 10000 │\n", "│ C ┆ 2 ┆ 10000 │\n", @@ -589,7 +589,7 @@ ], "source": [ "# We may not meet the target volume for all categories.\n", - "vn = sa.volume_neutral(\n", + "vn = ss.volume_neutral(\n", " df,\n", " by = pl.col(\"flags\"),\n", " control = pl.col(\"category\"),\n", @@ -624,7 +624,7 @@ ], "source": [ "print(df.shape)\n", - "train, test = sa.split_by_ratio(\n", + "train, test = ss.split_by_ratio(\n", " df,\n", " split_ratio = 0.6\n", ")\n", @@ -651,7 +651,7 @@ ], "source": [ "print(df.shape)\n", - "for frame in sa.split_by_ratio(df, split_ratio = [0.25, 0.4, 0.10, 0.25]):\n", + "for frame in ss.split_by_ratio(df, split_ratio = [0.25, 0.4, 0.10, 0.25]):\n", " print(frame.shape)" ] }, @@ -679,7 +679,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.13.1" } }, "nbformat": 4, diff --git a/mkdocs.yml b/mkdocs.yml index 03963e67..c647dbd0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -5,7 +5,7 @@ use_directory_urls: false nav: - Home: index.md -- Diagnosis: dia.md +- Explorative Data Analysis: eda.md - Pipeline: pipeline.md - Sample and Split: sample_and_split.md - Numerical Functions: num.md diff --git a/pyproject.toml b/pyproject.toml index 28986d28..6bb91644 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,13 +17,14 @@ classifiers = [ ] authors = [{ name = "Tianren Qin", email = "tq9695@gmail.com" }] dependencies = [ - "polars >= 0.20.16, !=1.3.0", + "polars >= 1.0.0, !=1.3.0", 'typing-extensions; python_version <= "3.11"', ] keywords = ["polars-extension", "scientific-computing", "data-science"] [project.optional-dependencies] +# "plotly >= 5.5" plot = ["great-tables>=0.9", "graphviz>=0.20", "altair >= 5.4.0", "vegafusion[embed]"] models = ["numpy>=1.16"] compat = ["numpy>=1.16"] @@ -40,5 +41,9 @@ line-length = 100 fix = true src = ["python"] +[tool.ruff.format] +docstring-code-format = true + + [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/python/polars_ds/__init__.py b/python/polars_ds/__init__.py index b1b19c80..1d339fa5 100644 --- a/python/polars_ds/__init__.py +++ b/python/polars_ds/__init__.py @@ -1,14 +1,8 @@ from __future__ import annotations import polars as pl +# Internal dependencies from ._utils import str_to_expr - -from polars_ds.num import * # noqa: F403 -from polars_ds.metrics import * # noqa: F403 -from polars_ds.stats import * # noqa: F403 -from polars_ds.string import * # noqa: F403 -from polars_ds.ts_features import * # noqa: F403 -from polars_ds.expr_knn import * # noqa: F403 -from polars_ds.expr_linear import * # noqa: F403 +from polars_ds.exprs import * __version__ = "0.7.0" diff --git a/python/polars_ds/_utils.py b/python/polars_ds/_utils.py index 97460ebc..ad0ed714 100644 --- a/python/polars_ds/_utils.py +++ b/python/polars_ds/_utils.py @@ -9,9 +9,9 @@ # Only need this _PLUGIN_PATH = Path(__file__).parent -# FLAG FOR v1 polars -_IS_POLARS_V1 = pl.__version__.startswith("1.") +# V1.18 Introduces a Int128 dtype +# _IS_POLARS_V1_18 = pl.__version__.startswith("1.18.") def pl_plugin( *, diff --git a/python/polars_ds/eda/__init__.py b/python/polars_ds/eda/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/polars_ds/diagnosis.py b/python/polars_ds/eda/diagnosis.py similarity index 58% rename from python/polars_ds/diagnosis.py rename to python/polars_ds/eda/diagnosis.py index 2b0a70d2..fb63e250 100644 --- a/python/polars_ds/diagnosis.py +++ b/python/polars_ds/eda/diagnosis.py @@ -7,15 +7,12 @@ from __future__ import annotations -from ._utils import _IS_POLARS_V1 - -if _IS_POLARS_V1: - from polars._typing import IntoExpr -else: - raise ValueError("You must be on Polars >= v1.0.0 to use this module.") - import altair as alt +# import plotly.express as px +# import plotly.graph_objs as go + import polars.selectors as cs +from polars._typing import IntoExpr import polars as pl import graphviz import warnings @@ -24,19 +21,21 @@ from functools import lru_cache from itertools import combinations from great_tables import GT, nanoplot_options - -from . import query_cond_entropy, principal_components, query_r2 -from .typing import CorrMethod, PolarsFrame -from .stats import corr -from .sample_and_split import sample +# Internal dependencies +from polars_ds.exprs.ts_features import query_cond_entropy +from polars_ds.exprs.stats import corr +from polars_ds.typing import CorrMethod, PolarsFrame +from polars_ds.sample_and_split import sample +from .plots import plot_feature, plot_feature_over alt.data_transformers.enable("vegafusion") +__all__ = ["DIA"] # DIA = Data Inspection Assistant / DIAgonsis class DIA: """ - Data Inspection Assistant. Most plots are powered by plotly/great_tables. Plotly may require + Data Inspection Assistant. Most plots are powered by Altair/great_tables. Altair may require additional package downloads. If you cannot import this module, please try: pip install "polars_ds[plot]" @@ -49,99 +48,6 @@ class DIA: # --- Static / Class Methods --- - # Only a static method for convenience. - @staticmethod - def _plot_lin_reg( - df: pl.DataFrame | pl.LazyFrame, - x: str, - target: str, - add_bias: bool = False, - weights: str | None = None, - max_points: int = 20_000, - filter_by: pl.Expr | None = None, - title_comments: str = "", - ) -> alt.Chart: - """ - See the method `plot_lin_reg` - """ - - to_select = [x, target] if weights is None else [x, target, weights] - if filter_by is None: - temp = df.lazy().select(*to_select) - else: - temp = df.lazy().filter(filter_by).select(*to_select) - - actual_title_comments = "" if title_comments == "" else "<" + title_comments + ">" - - xx = pl.col(x) - yy = pl.col(target) - # Although using simple_lin_reg might seem to be able to reduce some code here, - # it adds complexity because of output type and the r2 query. - # A little bit of code dup is reasonable. - if add_bias: - if weights is None: - x_mean = xx.mean() - y_mean = yy.mean() - beta = (xx - x_mean).dot(yy - y_mean) / (xx - x_mean).dot(xx - x_mean) - alpha = y_mean - beta * x_mean - else: - w = pl.col(weights) - w_sum = w.sum() - x_wmean = w.dot(xx) / w_sum - y_wmean = w.dot(yy) / w_sum - beta = w.dot((xx - x_wmean) * (yy - y_wmean)) / (w.dot((xx - x_wmean).pow(2))) - alpha = y_wmean - beta * x_wmean - else: - if weights is None: - beta = xx.dot(yy) / xx.dot(xx) - else: - w = pl.col(weights) - beta = w.dot(xx * yy) / w.dot(xx.pow(2)) - - alpha = pl.lit(0, dtype=pl.Float64) - - beta, alpha, r2 = ( - temp.select( - beta.alias("beta"), - alpha.alias("alpha"), - query_r2(yy, xx * beta + alpha).alias("r2"), - ) - .collect() - .row(0) - ) - - df_need = temp.select( - xx, - yy, - (xx * beta + alpha).alias("y_pred"), - ) - # Sample down. If len(temp) < max_points, all temp will be selected. This sample supports lazy. - df_sampled = sample(df_need, value=max_points) - - if add_bias and alpha > 0: - subtitle = f"y = {beta:.4f} * x + {round(alpha, 4) if add_bias else ''}, r2 = {r2:.4f}" - elif add_bias and alpha < 0: - subtitle = ( - f"y = {beta:.4f} * x - {abs(round(alpha, 4)) if add_bias else ''}, r2 = {r2:.4f}" - ) - else: - subtitle = f"y = {beta:.4f} * x, r2 = {r2:.4f}" - - title = alt.Title( - text=[ - f"Linear Regression: {target} ~ {x} {'+ bias' if add_bias else ''}", - actual_title_comments, - ], - subtitle=subtitle, - align="center", - ) - chart = ( - alt.Chart(df_sampled, title=title) - .mark_point() - .encode(alt.X(x).scale(zero=False), alt.Y(target)) - ) - return chart + chart.mark_line().encode(alt.X(x).scale(zero=False), alt.Y("y_pred")) - # --- Methods --- def __init__(self, df: PolarsFrame): @@ -324,20 +230,17 @@ def numeric_profile( else: return df_final - def plot_null_distribution( + def null_corr( self, subset: IntoExpr | Iterable[IntoExpr] = pl.all(), filter_by: pl.Expr | None = None, - sort: IntoExpr | Iterable[IntoExpr] | None = None, - descending: bool | Sequence[bool] = False, - row_group_size: int = 10_000, - ) -> GT: + ) -> pl.DataFrame: """ - Checks the null percentages per row group. Row groups are consecutive rows grouped by row number, - with each group having len//n_bins number of elements. The height of each bin is the percentage - of nulls in the row group. + Computes the correlation between A is null and B is null for all (A, B) combinations + in the given subset of columns. - This plot shows whether nulls in one feature is correlated with nulls in other features. + If either A or B is all null or all non-null, the null correlation will not be + computed, since the value is not going to be meaningful. Parameters ---------- @@ -345,62 +248,50 @@ def plot_null_distribution( Anything that can be put into a Polars .select statement. Defaults to pl.all() filter_by A boolean expression - sort - Whether to sort the dataframe first by some other expression. - descending - Only used when sort is not none. Sort in descending order. - row_group_size - The number of rows per row group """ cols = self._frame.select(subset).collect_schema().names() if filter_by is None: - frame = self._frame + frame = self._frame.select(pl.col(cols).is_null()).collect() else: - frame = self._frame.filter(filter_by) + frame = self._frame.filter(filter_by).select(pl.col(cols).is_null()).collect() - if sort is not None: - frame = frame.sort(sort, descending=descending) + df_null_cnt = frame.sum() + n = frame.shape[0] - temp = ( - frame.with_row_index(name="row_group") - .group_by((pl.col("row_group") // row_group_size).alias("row_group")) - .agg(pl.col(cols).null_count() / pl.len()) - .sort("row_group") - .select( - pl.col(cols).exclude(["row_group"]).implode(), - ) - .collect() - ) - # Values for plot. The first n are list[f64] used in nanoplot. The rest are overall null rates - percentages = temp.row(0) - temp2 = frame.select(pl.len(), pl.col(cols).null_count() / pl.len()).collect() - row = temp2.row(0) - total = row[0] - null_rates = row[1:] - - null_table = pl.DataFrame( - { - "column": cols, - "percentages in row groups": [{"val": values} for values in percentages], - "null%": null_rates, - "total": total, - } + invalid = set( + c + for c, cnt in zip(df_null_cnt.columns, df_null_cnt.row(0)) + if (cnt == 0 or cnt == n) ) - return ( - GT(null_table, rowname_col="column") - .tab_header(title="Null Distribution") - .tab_stubhead("column") - .fmt_number(columns=["null%"], decimals=5) - .fmt_percent(columns="null%") - .fmt_nanoplot( - columns="percentages in row groups", - plot_type="bar", - options=nanoplot_options(data_bar_fill_color=None), # "red" - ) - ) + xx = [] + yy = [] + for x, y in combinations(cols, 2): + if not (x in invalid or y in invalid): + xx.append(x) + yy.append(y) + + if len(xx) == 0: + return pl.DataFrame({ + "column_1": [], + "column_2": [], + "null_corr": [] + }, schema = { + "column_1": pl.String, + "column_2": pl.String, + "null_corr": pl.Float64, + }) + else: + corrs = frame.select( + pl.corr(x, y).alias(str(i)) for i, (x, y) in enumerate(zip(xx, yy)) + ).row(0) + return pl.DataFrame({ + "column_1": xx, + "column_2": yy, + "null_corr": corrs + }).sort(pl.col("null_corr").abs(), descending=True) def meta(self) -> Dict: """ @@ -688,23 +579,24 @@ def infer_corr(self, method: CorrMethod = "pearson") -> pl.DataFrame: One of ["pearson", "spearman", "xi", "kendall"] """ to_check = self.numerics + self.bools - correlation = ( + + xx = [] + yy = [] + for x, y in combinations(to_check, 2): + xx.append(x) + yy.append(y) + + corrs = ( self._frame.with_columns(pl.col(c).cast(pl.UInt8) for c in self.bools) .select( corr(x, y, method=method).alias(f"{i}") - for i, (x, y) in enumerate(combinations(to_check, 2)) + for i, (x, y) in enumerate(zip(xx, yy)) ) .collect() .row(0) ) - xx = [] - yy = [] - for x, y in combinations(to_check, 2): - xx.append(x) - yy.append(y) - - return pl.DataFrame({"x": xx, "y": yy, "corr": correlation}).sort( + return pl.DataFrame({"x": xx, "y": yy, "corr": corrs}).sort( pl.col("corr").abs(), descending=True ) @@ -757,21 +649,22 @@ def infer_dependency(self, subset: IntoExpr | Iterable[IntoExpr] = pl.all()) -> stacklevel=2, ) + # Construct output + column = [] + by = [] + for x, y in combinations(check, 2): + column.append(x) + by.append(y) + ce = ( self._frame.select( query_cond_entropy(x, y).abs().alias(f"{i}") - for i, (x, y) in enumerate(combinations(check, 2)) + for i, (x, y) in enumerate(zip(column, by)) ) .collect() .row(0) ) - # Construct output - column = [] - by = [] - for x, y in combinations(check, 2): - column.append(x) - by.append(y) out = pl.DataFrame({"column": column, "by": by, "cond_entropy": ce}).sort("cond_entropy") @@ -827,91 +720,13 @@ def plot_dependency( return dot - def plot_lin_reg( - self, - x: str, - target: str, - add_bias: bool = False, - weights: str | None = None, - max_points: int = 20_000, - by: str | None = None, - title_comments: str = "", - filter_by: pl.Expr | None = None, - ) -> alt.Chart | Exception: - """ - Plots the linear regression line between x and target. - - Paramters - --------- - x - The preditive variable - target - The target variable - add_bias - Whether to add bias in the linear regression - weights - Weights for the linear regression - max_points - The max number of points to be displayed. Notice that this only affects the number of points - on the plot. The linear regression will still be fit on the entire dataset. - title_comments - Additional comments to put in the plot title. - by - Create a lstsq plot for each segment in `by`. - filter_by - Additional filter condition to be applied. This will be applied upfront to the entire - dataframe, and then the dataframe will be partitioned by the segments. - This means it is possible to filter out entire segment(s) before plots are drawn. - """ - if by is None: - plot = DIA._plot_lin_reg( - self._frame, - x, - target, - add_bias, - weights, - max_points, - filter_by, - title_comments, - ) - return plot.configure(autosize="pad") - else: - if filter_by is None: - frame = self._frame - else: - frame = self._frame.filter(filter_by) - - plots = [] - for key, df in frame.collect().partition_by(by, as_dict=True).items(): - try: - plot = DIA._plot_lin_reg( - df, - x, - target, - add_bias, - weights, - max_points, - filter_by=None, - title_comments=f"Segment = {key if len(key) > 1 else key[0]}", - ) - plots.append(plot) - except Exception as e: - warnings.warn( - f"Error occured when plotting on segment: {key}\nOriginal Error Message: {e}" - ) - - return alt.vconcat( - *(plot for plot in plots if not isinstance(plot, Exception)) - ).configure(autosize="pad") - - def plot_dist( + def plot_feature( self, feature: str, n_bins: int | None = None, density: bool = False, show_bad_values: bool = True, filter_by: pl.Expr | None = None, - **kwargs, ) -> Tuple[pl.DataFrame, alt.Chart]: """ Plot distribution of the feature with a few statistical details. @@ -926,121 +741,29 @@ def plot_dist( Whether to plot a probability density or not filter_by An extra condition you may want to impose on the underlying dataset - include_null - When by is not null, whether to consider null a segment or not. If true, null values will be - mapped to the name "__null__". The string "__null__" should not exist originally in the column. - This is a workaround to get plotly to recognize null values. - max_rows - - kwargs - Keyword arguments for plotly's histogram function + show_bad_values + Whether to show % of bad (null or inf or nan) values """ - - if n_bins <= 2: - raise ValueError("For plot_dist, `n_bins` must be > 2.") if feature not in self.numerics: raise ValueError("Input feature must be numeric.") - if filter_by is None: - frame_with_filter = self._frame.select(feature) - else: - frame_with_filter = self._frame.select(feature).filter(filter_by) - - frame = frame_with_filter.filter( - pl.all_horizontal(pl.col(feature).is_finite(), pl.col(feature).is_not_null()) - ).collect() - - p5, median, mean, p95, min_, max_ = frame.select( - p5=pl.col(feature).quantile(0.05), - median=pl.col(feature).median(), - mean=pl.col(feature).mean(), - p95=pl.col(feature).quantile(0.95), - min=pl.col(feature).min(), - max=pl.col(feature).max(), - ).row(0) - - # bin computation - range_ = max_ - min_ - recip = 1 / n_bins - cuts = [recip * (i + 0.5) for i in range(1, n_bins + 1)] - cnt, values = ( - frame.select( - ((pl.col(feature) - min_) / range_) - .cut(breaks=cuts, include_breaks=True) - .struct.rename_fields(["brk", "category"]) - .struct.field("brk") - .value_counts(parallel=True) - .sort() - .alias("bins") - ) - .unnest("bins") - .select(cnt=pl.col("count"), values=pl.col("brk") * range_ + min_) - .get_columns() - ) - # histgram plot - df_plot = pl.DataFrame({"counts": cnt, "cuts": values}) - density_str = "density" if density else "counts" - alt_y = alt.Y(f"{density_str}:Q", scale=alt.Scale(domainMin=0)).title(density_str) - if density: - df_plot = df_plot.with_columns(density=pl.col("counts") / pl.col("counts").sum()) - - base = alt.Chart(df_plot, title=f"Distribution for {feature}") - dist_chart = base.mark_bar(size=15).encode( - alt.X("cuts:Q", axis=alt.Axis(tickCount=n_bins // 2, grid=False)), - alt_y, - tooltip=[ - alt.Tooltip("cuts:Q", title="CutValue"), - alt.Tooltip(f"{density_str}:Q", title=density_str), - ], - ) - # stats overlay - df_stats = pl.DataFrame( - {"names": ["p5", "p50", "avg", "p95"], "stats": [p5, median, mean, p95]} + return plot_feature( + df = self._frame.select(feature) if filter_by is None else self._frame.filter(filter_by).select(feature), + feature = feature, + n_bins = n_bins, + density = density, + show_bad_values = show_bad_values ) - stats_base = alt.Chart(df_stats) - stats_chart = stats_base.mark_rule(color="red").encode( - x=alt.X("stats").title(""), - tooltip=[ - alt.Tooltip("names:N", title="Stats"), - alt.Tooltip("stats:Q", title="Value"), - ], - ) - # null, inf, nan percentages bar - if show_bad_values: - bad_pct = ( - frame_with_filter.select( - pl.any_horizontal(pl.col(feature).is_null(), ~pl.col(feature).is_finite()).sum() - / pl.len() - ) - .collect() - .item(0, 0) - ) - - df_bad = pl.DataFrame({"Null/NaN/Inf%": [bad_pct]}) - bad_chart = ( - alt.Chart(df_bad) - .mark_bar(opacity=0.5) - .encode( - alt.X("Null/NaN/Inf%:Q", scale=alt.Scale(domain=[0, 1])), - tooltip=[ - alt.Tooltip("Null/NaN/Inf%:Q", title="Null/NaN/Inf%"), - ], - ) - ) - chart = alt.vconcat(dist_chart + stats_chart, bad_chart) - else: - chart = dist_chart + stats_chart - - return df_plot, chart - - def compare_dist_on_segment( + def plot_feature_over( self, feature: str, - by: IntoExpr, + segment: str, n_bins: int = 30, density: bool = True, filter_by: pl.Expr | None = None, + show_bad_values: bool = True, + include_null_segment: bool = False ) -> alt.Chart: """ Compare the distribution of a feature on a segment. @@ -1049,7 +772,7 @@ def compare_dist_on_segment( ---------- feature A string representing a column name - by + segment The segment. Anything that evaluates to a column that can be casted to string and used as dicrete segments. Null values in this segment column will be mapped to '__null__'. n_bins @@ -1058,142 +781,21 @@ def compare_dist_on_segment( Whether to show a histogram or a density plot filter_by An optional filter. If not none, this will be applied to the entire data upfront before the segmentation. + show_bad_values + Whether to show % of bad (null or inf or nan) values + include_null_segment + Whether to treat null values in the segment column as a segment. """ + if feature not in self.numerics: + raise ValueError("Input feature must be numeric.") - feat, segment = self._frame.select(feature, by).collect_schema().names() - if filter_by is None: - frame = ( - self._frame.filter( - pl.all_horizontal(pl.col(feat).is_not_null(), pl.col(feat).is_finite()) - ) - .select(feat, by) - .collect() - ) - else: - frame = ( - self._frame.filter( - pl.all_horizontal( - pl.col(feat).is_not_null(), pl.col(feat).is_finite(), filter_by - ) - ) - .select(feat, by) - .collect() - ) - - selection = alt.selection_point(fields=[segment], bind="legend") - # Null will be a group in Altair's chart, but it breaks the predicate evaluation, making - # toggling the null group impossible. (This is likely a Altair bug). We - # map nulls to a special string '__null__' to avoid that issue - frame = frame.with_columns(pl.col(segment).cast(pl.String).fill_null(pl.lit("__null__"))) - base = alt.Chart(frame, title=f"Distribution of {feat} on segment {segment}") - if density: - dist_chart = ( - base.transform_density( - feat, - groupby=[segment], - as_=[feat, "density"], - ) - .mark_bar(opacity=0.55, binSpacing=0) - .encode( - alt.X(f"{feat}:Q"), - alt.Y("density:Q", scale=alt.Scale(domainMin=0)).stack(None), - color=f"{segment}:N", - opacity=alt.condition(selection, alt.value(0.55), alt.value(0.0)), - ) - .add_selection(selection) - ) - else: - dist_chart = ( - base.mark_bar(opacity=0.55, binSpacing=0) - .encode( - alt.X(f"{feat}:Q"), - alt.Y("count()", scale=alt.Scale(domainMin=0)).stack(None), - color=f"{segment}:N", - opacity=alt.condition(selection, alt.value(0.55), alt.value(0.0)), - ) - .add_selection(selection) - ) - - df_temp = self._frame if filter_by is None else self._frame.filter(filter_by) - df_bad = ( - df_temp.group_by(by) - .agg(bad_rate=(pl.col(feat).is_null() | (~pl.col(feat).is_finite())).sum() / pl.len()) - .with_columns(pl.col(segment).fill_null(pl.lit("__null__"))) - .collect() - ) - bad_chart = ( - alt.Chart(df_bad) - .mark_bar(opacity=0.5) - .encode( - alt.X("bad_rate:Q", scale=alt.Scale(domain=[0, 1])).title("Null/NaN/Inf%"), - alt.Y(f"{segment}:N"), - color=f"{segment}:N", - tooltip=[ - alt.Tooltip("bad_rate:Q", title="Null/NaN/Inf%"), - ], - ) - ) - return alt.vconcat(dist_chart, bad_chart) - - def plot_pca( - self, - *features: IntoExpr | Iterable[IntoExpr], - by: IntoExpr, - center: bool = True, - dim: int = 2, - filter_by: pl.Expr | None = None, - max_points: int = 10_000, - **kwargs, - ) -> alt.Chart: - """ - Creates a scatter plot based on the reduced dimensions via PCA, and color it by `by`. - - Paramters - --------- - features - Any selection expression for Polars - by - Color the 2-D PCA plot by the values in the column - center - Whether to automatically center the features - dim - Only 2 principal components plot can be done at this moment. - filter_by - A boolean expression - max_points - The max number of points to be displayed. If data > this limit, the data will be sampled. - kwargs - Anything else that will be passed to plotly's scatter function - """ - feats = self._frame.select(features).collect_schema().names() - - if len(feats) < 2: - raise ValueError("You must pass >= 2 features.") - if dim != 2: - raise NotImplementedError - # if dim < 2 or dim > 3: - # raise ValueError("Input `dim` must either be 2 or 3.") - - if filter_by is None: - frame = self._frame - else: - frame = self._frame.filter(filter_by) - - temp = frame.select(principal_components(*feats, center=center, k=dim).alias("pc"), by) - df = sample(temp, value=max_points).unnest("pc") - - if dim == 2: - selection = alt.selection_point(fields=[by], bind="legend") - return ( - alt.Chart(df, title="PC2 Plot") - .mark_point() - .encode( - alt.X("pc1:Q"), - alt.Y("pc2:Q"), - alt.Color(f"{by}:N"), - opacity=alt.condition(selection, alt.value(1), alt.value(0.1)), - ) - .add_params(selection) - ) - else: - raise NotImplementedError + frame = self._frame.select(feature, segment) if filter_by is None else self._frame.filter(filter_by).select(feature, segment) + return plot_feature_over( + feature = feature, + segment = segment, + n_bins = n_bins, + density = density, + include_null_segment = include_null_segment, + show_bad_values = show_bad_values, + df = frame, + ) \ No newline at end of file diff --git a/python/polars_ds/eda/plots.py b/python/polars_ds/eda/plots.py new file mode 100644 index 00000000..429625d5 --- /dev/null +++ b/python/polars_ds/eda/plots.py @@ -0,0 +1,507 @@ +from __future__ import annotations + +import polars as pl +import altair as alt +from typing import Iterable, List, Tuple +from polars._typing import IntoExpr +# Internal dependencies +import polars_ds.sample_and_split as sa +from polars_ds import query_r2, principal_components, query_tpr_fpr, integrate_trapz + +alt.data_transformers.enable("vegafusion") + +# Plots should never have a title. Title must be editable by the end user +# Interactivity should only be enabled by the end user + +def plot_feature( + *, + feature: str | pl.Expr | Iterable[float], + n_bins: int | None = None, + density: bool = False, + show_bad_values: bool = True, + df: pl.DataFrame | pl.LazyFrame | None = None, +) -> Tuple[pl.DataFrame, alt.Chart]: + """ + Plot distribution of the feature with a few statistical details. + + Parameters + ---------- + df + Either an eager or lazy Polars Dataframe + feature + A string representing a column name + n_bins + The number of bins used for histograms. Not used when the feature column is categorical. + density + Whether to plot a probability density or not + show_bad_values + Whether to show % of bad (null or inf or nan) values + """ + # include_null + # When by is not null, whether to consider null a segment or not. If true, null values will be + # mapped to the name "__null__". The string "__null__" should not exist originally in the column. + # This is a workaround to get plotly to recognize null values. + + if n_bins <= 2: + raise ValueError("Input `n_bins` must be > 2.") + + if isinstance(feature, str): + if df is None: + raise ValueError("If `feature` is str, then df cannot be none.") + feat = feature + data = df.lazy() + elif isinstance(feature, pl.Expr): + if df is None: + raise ValueError("If `feature` is pl.expr, then df cannot be none.") + data = df.lazy() + feat = data.select(feature).collect_schema().names()[0] + else: + feat = "feature" + data = pl.Series(name = "feature", values = feature).to_frame().lazy() + + frame = data.filter( + pl.all_horizontal(pl.col(feat).is_finite(), pl.col(feat).is_not_null()) + ).collect() + + p5, median, mean, p95, min_, max_ = frame.select( + p5=pl.col(feat).quantile(0.05), + median=pl.col(feat).median(), + mean=pl.col(feat).mean(), + p95=pl.col(feat).quantile(0.95), + min=pl.col(feat).min(), + max=pl.col(feat).max(), + ).row(0) + + # bin computation + range_ = max_ - min_ + recip = 1 / n_bins + cuts = [recip * (i + 0.5) for i in range(1, n_bins + 1)] + df_plot = ( + frame.select( + ((pl.col(feat) - min_) / range_) + .cut(breaks=cuts, include_breaks=True) + .struct.rename_fields(["brk", "category"]) + .struct.field("brk") + .value_counts(parallel=True) + .sort() + .alias("bins") + ) + .unnest("bins") + .select(counts=pl.col("count"), cuts=pl.col("brk") * range_ + min_) + ) + # histgram plot + # df_plot = pl.DataFrame({"counts": cnt, "cuts": values}) + density_str = "density" if density else "counts" + alt_y = alt.Y(f"{density_str}:Q", scale=alt.Scale(domainMin=0)).title(density_str) + if density: + df_plot = df_plot.with_columns(density=pl.col("counts") / pl.col("counts").sum()) + + base = alt.Chart(df_plot) + dist_chart = base.mark_bar(size=15).encode( + alt.X("cuts:Q", axis=alt.Axis(tickCount=n_bins // 2, grid=False)), + alt_y, + tooltip=[ + alt.Tooltip("cuts:Q", title="CutValue"), + alt.Tooltip(f"{density_str}:Q", title=density_str), + ], + ) + # stats overlay + df_stats = pl.DataFrame( + {"names": ["p5", "p50", "avg", "p95"], "stats": [p5, median, mean, p95]} + ) + + stats_base = alt.Chart(df_stats) + stats_chart = stats_base.mark_rule(color="#f086ab").encode( + x=alt.X("stats").title(""), + tooltip=[ + alt.Tooltip("names:N", title="Stats"), + alt.Tooltip("stats:Q", title="Value"), + ], + ) + # null, inf, nan percentages bar + if show_bad_values: + bad_pct = ( + data.select( + pl.any_horizontal(pl.col(feat).is_null(), ~pl.col(feat).is_finite()).sum() + / pl.len() + ) + .collect() + .item(0, 0) + ) + + df_bad = pl.DataFrame({"(Null/NaN/Inf)%": [bad_pct]}) + bad_chart = ( + alt.Chart(df_bad) + .mark_bar(opacity=0.5) + .encode( + alt.X("(Null/NaN/Inf)%:Q", scale=alt.Scale(domain=[0, 1])), + tooltip=[ + alt.Tooltip("(Null/NaN/Inf)%:Q", title="(Null/NaN/Inf)%"), + ], + ) + ) + chart = alt.vconcat(dist_chart + stats_chart, bad_chart) + else: + chart = dist_chart + stats_chart + + return df_plot, chart + +def plot_feature_over( + *, + df: pl.DataFrame | pl.LazyFrame, + feature: str, + segment: str, + n_bins: int = 30, + density: bool = True, + show_bad_values: bool = True, + include_null_segment: bool = False, + # segment_null_replacer +) -> alt.Chart: + """ + Compare the distribution of a feature over a segment. + + Parameters + ---------- + df + Either an eager or lazy Polars Dataframe + feature + A string representing a column name + segment + The segment. + n_bins + The max number of bins for the plot. + density + Whether to show a histogram or a density plot + show_bad_values + Whether to show % of bad (null or inf or nan) values + include_null_segment + Whether to treat null values in the segment column as a segment. + """ + if n_bins <= 2: + raise ValueError("Input `n_bins` must be > 2.") + + if not isinstance(segment, str): + raise ValueError("Input `segment` must be a string.") + + if isinstance(feature, str): + feat = feature + data = df.lazy() + elif isinstance(feature, pl.Expr): + data = df.lazy() + feat = data.select(feature).collect_schema().names()[0] + else: + feat = "feature" + data = pl.Series(name = "feature", values = feature).to_frame().lazy() + + if not include_null_segment: + data = data.filter(pl.col(segment).is_not_null()) + + feat, segment = data.select(feature, segment).collect_schema().names() + frame = ( + data.filter( + pl.all_horizontal(pl.col(feat).is_not_null(), pl.col(feat).is_finite()) + ) + .select(feat, pl.col(segment)) + .collect() + ) + + selection = alt.selection_point(fields=[segment], bind="legend") + # Null will be a group in Altair's chart, but it breaks the predicate evaluation, making + # toggling the null group impossible. (This is likely a Altair bug). We can + # map nulls to a special string '__null__' to avoid that issue + # frame = frame.with_columns(pl.col(segment).cast(pl.String).fill_null(pl.lit("__null__"))) + base = alt.Chart(frame) + if density: + dist_chart = ( + base.transform_density( + feat, + groupby=[segment], + as_=[feat, "density"], + ) + .mark_bar(opacity=0.5, binSpacing=0) + .encode( + alt.X(f"{feat}:Q"), + alt.Y("density:Q", scale=alt.Scale(domainMin=0)).stack(None), + color=alt.Color(f"{segment}:N"), # legend=alt.Legend(columns=8) + opacity=alt.condition(selection, alt.value(0.5), alt.value(0.0)), + ) + .add_selection(selection) + ) + else: + dist_chart = ( + base.mark_bar(opacity=0.5, binSpacing=0) + .encode( + alt.X(f"{feat}:Q"), + alt.Y("count()", scale=alt.Scale(domainMin=0)).stack(None), + color=f"{segment}:N", + opacity=alt.condition(selection, alt.value(0.5), alt.value(0.0)), + ) + .add_selection(selection) + ) + + if show_bad_values: + df_bad = ( + data.group_by(segment) + .agg(bad_rate=(pl.col(feat).is_null() | (~pl.col(feat).is_finite())).sum() / pl.len()) + .collect() + # .with_columns(pl.col(segment).fill_null(pl.lit("__null__"))) + ) + bad_chart = ( + alt.Chart(df_bad) + .mark_bar(opacity=0.5) + .encode( + alt.X("bad_rate:Q", scale=alt.Scale(domain=[0, 1])).title("(Null/NaN/Inf)%"), + alt.Y(f"{segment}:N"), + color=f"{segment}:N", + tooltip=[ + alt.Tooltip("bad_rate:Q", title="(Null/NaN/Inf)%"), + ], + ) + ) + return alt.vconcat(dist_chart, bad_chart) + else: + return dist_chart + +def plot_lin_reg( + df: pl.DataFrame | pl.LazyFrame, + x: str, + target: str, + add_bias: bool = False, + weights: str | None = None, + max_points: int = 20_000, + show_lin_reg_eq: bool = True, +) -> alt.Chart: + """ + Plots the linear regression line between x and target. + + Paramters + --------- + df + Either an eager or lazy Polars Dataframe + x + The preditive variable + target + The target variable + add_bias + Whether to add bias in the linear regression + weights + Weights for the linear regression + max_points + The max number of points to be displayed. Notice that this only affects the number of points + on the plot. The linear regression will still be fit on the entire dataset. + show_lin_reg_eq + Whether to show the linear regression equation at the bottom or not + """ + + to_select = [x, target] if weights is None else [x, target, weights] + temp = df.lazy().select(*to_select) + + xx = pl.col(x) + yy = pl.col(target) + # Although using simple_lin_reg might seem to be able to reduce some code here, + # it adds complexity because of output type and the r2 query. + # A little bit of code dup is reasonable. + if add_bias: + if weights is None: + x_mean = xx.mean() + y_mean = yy.mean() + beta = (xx - x_mean).dot(yy - y_mean) / (xx - x_mean).dot(xx - x_mean) + alpha = y_mean - beta * x_mean + else: + w = pl.col(weights) + w_sum = w.sum() + x_wmean = w.dot(xx) / w_sum + y_wmean = w.dot(yy) / w_sum + beta = w.dot((xx - x_wmean) * (yy - y_wmean)) / (w.dot((xx - x_wmean).pow(2))) + alpha = y_wmean - beta * x_wmean + else: + if weights is None: + beta = xx.dot(yy) / xx.dot(xx) + else: + w = pl.col(weights) + beta = w.dot(xx * yy) / w.dot(xx.pow(2)) + + alpha = pl.lit(0, dtype=pl.Float64) + + beta, alpha, r2, length = ( + temp.select( + beta.alias("beta"), + alpha.alias("alpha"), + query_r2(yy, xx * beta + alpha).alias("r2"), + pl.len() + ) + .collect() + .row(0) + ) + + df_need = temp.select( + xx, + yy, + (xx * beta + alpha).alias("y_pred"), + ) + # Sample down if len(temp) > max_points + df_sampled = sa.sample(df_need, value=max_points) if length > max_points else df_need.collect() + + x_title = [x] + if show_lin_reg_eq: + if add_bias and alpha > 0: + reg_info = f"y = {beta:.4f} * x + {round(alpha, 4) if add_bias else ''}, r2 = {r2:.4f}" + elif add_bias and alpha < 0: + reg_info = ( + f"y = {beta:.4f} * x - {abs(round(alpha, 4)) if add_bias else ''}, r2 = {r2:.4f}" + ) + else: + reg_info = f"y = {beta:.4f} * x, r2 = {r2:.4f}" + + x_title.append(reg_info) + + chart = ( + alt.Chart(df_sampled) + .mark_point() + .encode(alt.X(x).scale(zero=False), alt.Y(target)) + ) + return ( + chart + + chart.mark_line().encode( + alt.X(x, title = x_title).scale(zero=False), + alt.Y("y_pred"), + ) + ) + + +def plot_pca( + df: pl.DataFrame | pl.LazyFrame, + features: List[str], + by: IntoExpr, + center: bool = True, + dim: int = 2, + filter_by: pl.Expr | None = None, + max_points: int = 10_000, + **kwargs, +) -> alt.Chart: + """ + Creates a scatter plot based on the reduced dimensions via PCA, and color it by `by`. + + Paramters + --------- + df + Either an eager or lazy Polars Dataframe + features + List of feature names + by + Color the 2-D PCA plot by the values in the column + center + Whether to automatically center the features + dim + Only 2 principal components plot can be done at this moment. + filter_by + A boolean expression + max_points + The max number of points to be displayed. If data > this limit, the data will be sampled. + kwargs + Anything else that will be passed to Altair encode function + """ + if len(features) < 2: + raise ValueError("You must pass >= 2 features.") + if dim not in (2, 3): + raise ValueError("Dim must be 2 or 3.") + + frame = df if filter_by is None else df.filter(filter_by) + + temp = frame.select(principal_components(*features, center=center, k=dim).alias("pc"), by) + df_plot = sa.sample(temp, value=max_points).unnest("pc") + + if dim == 2: + return alt.Chart(df_plot).mark_circle(size=60).encode( + x='pc1', + y='pc2', + color=by, + **kwargs + ) # .interactive() + else: # 3d + raise NotImplementedError + +def plot_roc_auc( + *, + actual: Iterable[int] | str | pl.Expr, + pred: Iterable[float] | str | pl.Expr, + df: pl.DataFrame | pl.LazyFrame | None = None, + show_auc: bool = True, + estimator_name: str = "", + line_color: str = "#92e884", + round_to: int = 4 +) -> alt.Chart: + """ + Plots ROC AUC curve. + + Paramters + --------- + df + Either an eager or lazy Polars Dataframe + actual + A column which has the actual binary target information + pred + The prediction + show_auc + Whether to show the AUC value or not + estimator_name + Name for the estiamtor. Only shown if show_auc is True + line_color + HTML color code + round_to + Round to n-th decimal digit if show_auc is True + """ + # expr_based = isinstance(actual, (str, pl.Expr)) and isinstance(pred, (str, pl.Expr)) and isinstance(df, (pl.DataFrame, pl.LazyFrame)) + if isinstance(actual, (str, pl.Expr)) and isinstance(pred, (str, pl.Expr)) and isinstance(df, (pl.DataFrame, pl.LazyFrame)): + zero = pl.DataFrame({ + "tpr": [0.], + "fpr": [0.], + }, schema = { + "tpr": pl.Float64, + "fpr": pl.Float64, + }) + + tpr_fpr = df.lazy().select( + tpr_fpr = query_tpr_fpr(actual, pred).reverse() + ).unnest("tpr_fpr").select( + "tpr", + "fpr", + ).collect() + df_plot = pl.concat([zero, tpr_fpr]) + + chart = alt.Chart(df_plot).mark_line(interpolate="step", color = line_color).encode( + x=alt.X('fpr', title = "False Positive Rate"), + y=alt.Y('tpr', title = "True Positive Rate"), + ) + if show_auc: + auc = tpr_fpr.select( + integrate_trapz("tpr", "fpr") + ).item(0, 0) + df_text = pl.DataFrame({ + "x": [1.0] + , "y": [0.] + }) + estimator = estimator_name.strip() + auc_text = f"AUC = {round(auc, round_to)}" if estimator == "" else f"{estimator} (AUC = {round(auc, round_to)})" + text = alt.Chart(df_text).mark_point(opacity=0.0).encode( + x = alt.X("x"), + y = alt.Y("y"), + ).mark_text( + dx = -1, + dy = -5, + fontWeight="bold", + text = auc_text, + align="right" + ) + return chart + text + else: + return chart + else: # May fail. User should catch + s1 = pl.Series("actual", values=actual, dtype=pl.UInt32) + s2 = pl.Series("pred", values=pred) + df_temp = pl.DataFrame({ + "actual": s1, + "pred": s2, + }) + return plot_roc_auc(df = df_temp, actual = "actual", pred = "pred", show_auc=show_auc, estimator_name = estimator_name, line_color=line_color, round_to=round_to) + diff --git a/python/polars_ds/exprs/__init__.py b/python/polars_ds/exprs/__init__.py new file mode 100644 index 00000000..205bc6e4 --- /dev/null +++ b/python/polars_ds/exprs/__init__.py @@ -0,0 +1,7 @@ +from .expr_knn import * +from .expr_linear import * +from .metrics import * +from .num import * +from .stats import * +from .string import * +from .ts_features import * \ No newline at end of file diff --git a/python/polars_ds/expr_balltree.py b/python/polars_ds/exprs/expr_balltree.py similarity index 100% rename from python/polars_ds/expr_balltree.py rename to python/polars_ds/exprs/expr_balltree.py diff --git a/python/polars_ds/expr_knn.py b/python/polars_ds/exprs/expr_knn.py similarity index 99% rename from python/polars_ds/expr_knn.py rename to python/polars_ds/exprs/expr_knn.py index 3ea78cc2..6f87dbf0 100644 --- a/python/polars_ds/expr_knn.py +++ b/python/polars_ds/exprs/expr_knn.py @@ -5,8 +5,9 @@ from __future__ import annotations import polars as pl from typing import Iterable, List -from .typing import Distance -from ._utils import pl_plugin, str_to_expr +# Internal dependencies +from polars_ds._utils import pl_plugin, str_to_expr +from polars_ds.typing import Distance __all__ = [ "query_knn_ptwise", diff --git a/python/polars_ds/expr_linear.py b/python/polars_ds/exprs/expr_linear.py similarity index 99% rename from python/polars_ds/expr_linear.py rename to python/polars_ds/exprs/expr_linear.py index 312d1fa9..85e22bed 100644 --- a/python/polars_ds/expr_linear.py +++ b/python/polars_ds/exprs/expr_linear.py @@ -3,9 +3,10 @@ from __future__ import annotations import polars as pl import warnings -from .typing import LRSolverMethods, NullPolicy -from ._utils import pl_plugin from typing import List, Any +# Internal dependencies +from polars_ds.typing import LRSolverMethods, NullPolicy +from polars_ds._utils import pl_plugin __all__ = [ "lin_reg", diff --git a/python/polars_ds/metrics.py b/python/polars_ds/exprs/metrics.py similarity index 98% rename from python/polars_ds/metrics.py rename to python/polars_ds/exprs/metrics.py index e9fd5e20..a3225f91 100644 --- a/python/polars_ds/metrics.py +++ b/python/polars_ds/exprs/metrics.py @@ -3,9 +3,9 @@ from __future__ import annotations import polars as pl - -from ._utils import pl_plugin, str_to_expr -from .typing import MultiAUCStrategy +# Internal dependencies +from polars_ds._utils import pl_plugin, str_to_expr +from polars_ds.typing import MultiAUCStrategy __all__ = [ "query_r2", @@ -352,7 +352,7 @@ def query_roc_auc( Parameters ---------- actual - An expression represeting the actual + An expression represeting the actual. Must be castable to UInt32. pred An expression represeting the column with predicted probability. """ @@ -374,7 +374,7 @@ def query_tpr_fpr( Parameters ---------- actual - An expression represeting the actual + An expression represeting the actual. Must be castable to UInt32. pred An expression represeting the column with predicted probability. """ diff --git a/python/polars_ds/num.py b/python/polars_ds/exprs/num.py similarity index 99% rename from python/polars_ds/num.py rename to python/polars_ds/exprs/num.py index 51cfab26..cfabc7dc 100644 --- a/python/polars_ds/num.py +++ b/python/polars_ds/exprs/num.py @@ -4,12 +4,13 @@ import math import polars as pl from typing import List, Iterable -from .typing import ( +# Internal dependencies +from polars_ds.typing import ( DetrendMethod, ConvMode, ConvMethod, ) -from ._utils import pl_plugin, str_to_expr +from polars_ds._utils import pl_plugin, str_to_expr __all__ = [ "singular_values", diff --git a/python/polars_ds/stats.py b/python/polars_ds/exprs/stats.py similarity index 99% rename from python/polars_ds/stats.py rename to python/polars_ds/exprs/stats.py index 85d5380d..5dd0c2bd 100644 --- a/python/polars_ds/stats.py +++ b/python/polars_ds/exprs/stats.py @@ -4,8 +4,9 @@ import polars as pl import math -from .typing import Alternative, CorrMethod, Noise, QuantileMethod -from ._utils import pl_plugin, str_to_expr +# Internal dependencies +from polars_ds.typing import Alternative, CorrMethod, Noise, QuantileMethod +from polars_ds._utils import pl_plugin, str_to_expr __all__ = [ "ttest_ind", diff --git a/python/polars_ds/string.py b/python/polars_ds/exprs/string.py similarity index 99% rename from python/polars_ds/string.py rename to python/polars_ds/exprs/string.py index 395f8134..a8a00866 100644 --- a/python/polars_ds/string.py +++ b/python/polars_ds/exprs/string.py @@ -4,8 +4,8 @@ import polars as pl from typing import List, Literal, Dict -from ._utils import pl_plugin, str_to_expr - +# Internal dependencies +from polars_ds._utils import pl_plugin, str_to_expr __all__ = [ "filter_by_levenshtein", diff --git a/python/polars_ds/ts_features.py b/python/polars_ds/exprs/ts_features.py similarity index 99% rename from python/polars_ds/ts_features.py rename to python/polars_ds/exprs/ts_features.py index 42cbddf0..219a4022 100644 --- a/python/polars_ds/ts_features.py +++ b/python/polars_ds/exprs/ts_features.py @@ -4,9 +4,10 @@ import math import polars as pl -from .typing import Distance, NullPolicy -from ._utils import pl_plugin, str_to_expr from typing import Iterable, Literal +# Internal dependencies +from polars_ds.typing import Distance, NullPolicy +from polars_ds._utils import pl_plugin, str_to_expr __all__ = [ "query_abs_energy", diff --git a/python/polars_ds/modeling/__init__.py b/python/polars_ds/modeling/__init__.py new file mode 100644 index 00000000..5128d7fa --- /dev/null +++ b/python/polars_ds/modeling/__init__.py @@ -0,0 +1 @@ +from .pipeline import Blueprint, Pipeline, FitStep \ No newline at end of file diff --git a/python/polars_ds/pipeline.py b/python/polars_ds/modeling/pipeline.py similarity index 87% rename from python/polars_ds/pipeline.py rename to python/polars_ds/modeling/pipeline.py index 9a4708cb..f7f61a38 100644 --- a/python/polars_ds/pipeline.py +++ b/python/polars_ds/modeling/pipeline.py @@ -6,12 +6,18 @@ import json import sys import polars.selectors as cs -from . import transforms as t from functools import partial from dataclasses import dataclass from polars.type_aliases import IntoExprColumn from typing import List, Union, Dict, Any, Tuple -from .typing import ( +if sys.version_info >= (3, 11): + from typing import Self +else: # 3.10, 3.9, 3.8 + from typing_extensions import Self + +# Internal Depenedncies +from . import transforms as t +from polars_ds.typing import ( TypeAlias, PolarsFrame, ExprTransform, @@ -22,15 +28,10 @@ EncoderDefaultStrategy, ) -from ._utils import _IS_POLARS_V1 - -if sys.version_info >= (3, 11): - from typing import Self -else: # 3.10, 3.9, 3.8 - from typing_extensions import Self __all__ = ["Pipeline", "Blueprint", "FitStep"] +# Need to refactor and think of a better abstraction for the layers @dataclass class SQLStep: # FittedStep @@ -77,16 +78,11 @@ def fit(self, df: PolarsFrame) -> ExprTransform: if self.cols is None: return self.func(df) else: - if _IS_POLARS_V1: - real_cols: List[str] = [ - x - for x in df.lazy().select(self.cols).collect_schema().names() - if x not in self.exclude - ] - else: - real_cols: List[str] = [ - x for x in df.select(self.cols).columns if x not in self.exclude - ] + real_cols: List[str] = [ + x + for x in df.lazy().select(self.cols).collect_schema().names() + if x not in self.exclude + ] return self.func(df, real_cols) @@ -130,16 +126,10 @@ def _step_to_json(step: FittedStep) -> Dict: if isinstance(step, SQLStep): return {"SQLStep": step.sql} else: - if _IS_POLARS_V1: - try: - exprs = [e.meta.serialize(format="json") for e in step] - except Exception as e: - raise ValueError(f"The `FittedStep` is ill-defined. Original error: \n{e}") - else: - try: - exprs = [e.meta.serialize() for e in step] - except Exception as e: - raise ValueError(f"The `FittedStep` is ill-defined. Original error: \n{e}") + try: + exprs = [e.meta.serialize(format="json") for e in step] + except Exception as e: + raise ValueError(f"The `FittedStep` is ill-defined. Original error: \n{e}") if isinstance(step, SelectStep): return {"SelectStep": exprs} @@ -165,6 +155,8 @@ class Pipeline: transforms: List[FittedStep] ensure_features_in: bool = False ensure_features_out: bool = True + lowercase: bool = False + uppercase: bool = False def __str__(self) -> str: return self.transforms.__str__() @@ -191,7 +183,14 @@ def _generate_lazy_plan(self, df: PolarsFrame) -> pl.LazyFrame: If none, create the plan for the df that the pipe is initialized with. Otherwise, create the plan for the incoming df. """ - plan = df.lazy() + if self.lowercase: + plan = df.lazy().select(pl.all().name.to_lowercase()) + else: + if self.uppercase: + plan = df.lazy().select(pl.all().name.to_uppercase()) + else: + plan = df.lazy() + for step in self.transforms: if isinstance(step, WithColumnsStep): plan = plan.with_columns(step.exprs) @@ -218,6 +217,8 @@ def to_dict(self) -> Dict: "transforms": [_step_to_json(step) for step in self.transforms], "ensure_features_in": self.ensure_features_in, "ensure_features_out": self.ensure_features_out, + "lowercase": self.lowercase, + "uppercase": self.uppercase } def to_json(self, path: str | None = None, **kwargs) -> str | None: @@ -255,6 +256,8 @@ def from_dict(pipeline_dict: Dict[str, Any]) -> Self: feature_names_out_ = pipeline_dict["feature_names_out_"] ensure_features_in = pipeline_dict["ensure_features_in"] ensure_features_out = pipeline_dict["ensure_features_out"] + lowercase = pipeline_dict.get("lowercase", False) + uppercase = pipeline_dict.get("uppercase", False) except Exception as e: raise ValueError(f"Input dictionary is missing keywords. Original error: \n{e}") @@ -263,39 +266,26 @@ def from_dict(pipeline_dict: Dict[str, Any]) -> Self: # each step is a dict like {'SelectStep': [jsonified str expressions..]} for step in transforms: if "SelectStep" in step: - if _IS_POLARS_V1: - json_exprs = step.pop("SelectStep") - actual_exprs = [ - pl.Expr.deserialize(StringIO(e), format="json") for e in json_exprs - ] - transform_steps.append(SelectStep(actual_exprs)) - else: - json_exprs = step.pop("SelectStep") - actual_exprs = [pl.Expr.deserialize(StringIO(e)) for e in json_exprs] - transform_steps.append(SelectStep(actual_exprs)) - + json_exprs = step.pop("SelectStep") + actual_exprs = [ + pl.Expr.deserialize(StringIO(e), format="json") for e in json_exprs + ] + transform_steps.append(SelectStep(actual_exprs)) + elif "SpecialStep" in step: + json_exprs = step.pop("SpecialStep") + transform_steps.append(SpecialStep(json_exprs["SpecialStep"])) elif "WithColumnsStep" in step: - if _IS_POLARS_V1: - json_exprs = step.pop("WithColumnsStep") - actual_exprs = [ - pl.Expr.deserialize(StringIO(e), format="json") for e in json_exprs - ] - transform_steps.append(WithColumnsStep(actual_exprs)) - else: - json_exprs = step.pop("WithColumnsStep") - actual_exprs = [pl.Expr.deserialize(StringIO(e)) for e in json_exprs] - transform_steps.append(WithColumnsStep(actual_exprs)) + json_exprs = step.pop("WithColumnsStep") + actual_exprs = [ + pl.Expr.deserialize(StringIO(e), format="json") for e in json_exprs + ] + transform_steps.append(WithColumnsStep(actual_exprs)) elif "FilterStep" in step: - if _IS_POLARS_V1: - json_exprs = step.pop("FilterStep") - actual_exprs = [ - pl.Expr.deserialize(StringIO(e), format="json") for e in json_exprs - ] - transform_steps.append(FilterStep(actual_exprs)) - else: - json_exprs = step.pop("FilterStep") - actual_exprs = [pl.Expr.deserialize(StringIO(e)) for e in json_exprs] - transform_steps.append(FilterStep(actual_exprs)) + json_exprs = step.pop("FilterStep") + actual_exprs = [ + pl.Expr.deserialize(StringIO(e), format="json") for e in json_exprs + ] + transform_steps.append(FilterStep(actual_exprs)) elif "SQLStep" in step: sql = step.pop("SQLStep") transform_steps.append(SQLStep(sql)) @@ -310,6 +300,8 @@ def from_dict(pipeline_dict: Dict[str, Any]) -> Self: transforms=transform_steps, ensure_features_in=ensure_features_in, ensure_features_out=ensure_features_out, + lowercase=lowercase, + uppercase=uppercase ) def from_json_str(json_str: str) -> Self: @@ -344,7 +336,8 @@ def ensure_features_io(self, ensure_in: bool = True, ensure_out: bool = True) -> self.ensure_features_out = ensure_out return self - def transform(self, + def transform( + self, df: PolarsFrame, return_lazy: bool = False, separate: bool = False, @@ -365,13 +358,9 @@ def transform(self, the target. """ if self.ensure_features_in: - if _IS_POLARS_V1: - columns = df.lazy().collect_schema().names() - extras = [c for c in columns if c not in self.feature_names_in_] - missing = [c for c in self.feature_names_in_ if c not in columns] - else: - extras = [c for c in df.columns if c not in self.feature_names_in_] - missing = [c for c in self.feature_names_in_ if c not in df.columns] + columns = df.lazy().collect_schema().names() + extras = [c for c in columns if c not in self.feature_names_in_] + missing = [c for c in self.feature_names_in_ if c not in columns] if len(extras) > 0 or len(missing) > 0: raise ValueError( f"Input df doesn't have the features expected. Extra columns: {extras}. Missing columns: {missing}" @@ -414,6 +403,8 @@ def __init__( name: str = "test", target: str | None = None, exclude: List[str] | None = None, + lowercase: bool = False, + uppercase: bool = False, ): """ Creates a blueprint object. @@ -433,22 +424,41 @@ def __init__( the exact columns to transform. E.g. when you are using a selector like cs.numeric() for all numeric columns. If this is the case and target is not set nor excluded, then the transformation may be applied to the target as well, which is not desired in most cases. Therefore, it is highly recommended you initialize with target name. + lowercase + Whether to insert a lowercase column name step before all other transformations. + This takes precedence over uppercase. + uppercase + Whether to insert a uppercase column name step before all other transformations. + This only happens if lowercase is False """ self._df: pl.LazyFrame = df.lazy() + if lowercase: + self._df = self._df.select(pl.all().name.to_lowercase()) + else: + if uppercase: + self._df = self._df.select(pl.all().name.to_uppercase()) + self.name: str = str(name) self.target = target - self.feature_names_in_: list[str] = ( - self._df.collect_schema().names() if _IS_POLARS_V1 else list(df.columns) - ) + self.feature_names_in_: list[str] = self._df.collect_schema().names() + self._steps: List[Step] = [] self.exclude: List[str] = [] if target is None else [target] if exclude is not None: # dedup in case user accidentally puts the same column name twice self.exclude = list(set(self.exclude + exclude)) + self.lowercase = lowercase + self.uppercase = uppercase + def __str__(self) -> str: out: str = "" out += f"Blueprint name: {self.name}\n" + if self.lowercase: + out += "Column names: Lowercase all incoming columns." + elif self.uppercase: + out += "Column names: Uppercase all incoming columns." + out += f"Blueprint current steps: {len(self._steps)}\n" out += f"Features Expected: {self.feature_names_in_}\n" return out @@ -562,6 +572,22 @@ def nan_to_null(self) -> Self: self._steps.append(WithColumnsStep(cs.float().nan_to_null())) return self + def int_to_float(self, f32:bool=True) -> Self: + """ + Maps all integer columns to float. + + Parameters + ---------- + f32 + If true, map all integer columns to f32 columns. Otherwise they will be + casted to f64 columns. + """ + if f32: + self._steps.append(WithColumnsStep(cs.integer().cast(pl.Float32))) + else: + self._steps.append(WithColumnsStep(cs.integer().cast(pl.Float64))) + return self + def linear_impute( self, features: IntoExprColumn, target: str | pl.Expr | None = None, add_bias: bool = False ) -> Self: @@ -646,24 +672,28 @@ def select(self, cols: IntoExprColumn) -> Self: self._steps.append(SelectStep(cols)) return self - def shrink_dtype(self, force_f32: bool = False) -> Self: - """ - Shrinks the dtype by calling shrink_dtype on all numerical columns. This may reduce - the memory pressure during the process. + # Not working after pl.Int128 is introduced - Parameters - ---------- - force_f32 - If true, force all float columns to be f32 type. You might want to consider using f32 - only in the pipeline if you wish to save memory. - """ + # def shrink_dtype(self, force_f32: bool = False) -> Self: + # """ + # Shrinks the dtype by calling shrink_dtype on all numerical columns. This may reduce + # the memory pressure during the process. - exprs = cs.integer().shrink_dtype() - self._steps.append(WithColumnsStep(exprs)) - if force_f32: - self._steps.append(WithColumnsStep(cs.float().cast(pl.Float32))) + # Parameters + # ---------- + # force_f32 + # If true, force all float columns to be f32 type. You might want to consider using f32 + # only in the pipeline if you wish to save memory. + # """ + # # The reason for this is that pl.Int128 cannot be pickled yet??? + # exprs = cs.by_dtype( + # pl.Int32, pl.Int8, pl.UInt64, pl.UInt16, pl.UInt8, pl.Int64, pl.Int16, pl.UInt32 + # ).shrink_dtype() + # self._steps.append(WithColumnsStep(exprs)) + # if force_f32: + # self._steps.append(WithColumnsStep(cs.float().cast(pl.Float32))) - return self + # return self def polynomial_features( self, cols: List[str], degree: int, interaction_only: bool = True @@ -749,30 +779,6 @@ def rename(self, rename_dict: Dict[str, str]) -> Self: self._steps.append(WithColumnsStep([pl.col(k).alias(v) for k, v in rename_dict.items()])) return self.drop(old) - def lowercase(self) -> Self: - """ - Lowercases all column names. - """ - if _IS_POLARS_V1: - self._steps.append( - SelectStep([pl.col(c).alias(c.lower()) for c in self._df.collect_schema().names()]) - ) - else: - self._steps.append(SelectStep([pl.col(c).alias(c.lower()) for c in self._df.columns])) - return self - - def uppercase(self) -> Self: - """ - Uppercases all column names. - """ - if _IS_POLARS_V1: - self._steps.append( - SelectStep([pl.col(c).alias(c.upper()) for c in self._df.collect_schema().names()]) - ) - else: - self._steps.append(SelectStep([pl.col(c).alias(c.upper()) for c in self._df.columns])) - return self - def one_hot_encode( self, cols: IntoExprColumn, @@ -1047,20 +1053,21 @@ def materialize(self) -> Pipeline: # the collect should be and optimized. df_lazy: pl.LazyFrame = df.lazy() for step in self._steps: - if isinstance(step, FitStep): # Need fitting - exprs = step.fit(df_lazy) + if isinstance(step, FitStep): # Need fitting + df_temp = df_lazy.collect() + exprs = step.fit(df_temp) transforms.append(WithColumnsStep(exprs)) - df_lazy = df_lazy.with_columns(exprs) - elif isinstance(step, WithColumnsStep): # Fitted + df_lazy = df_temp.lazy().with_columns(exprs) + elif isinstance(step, WithColumnsStep): transforms.append(step) df_lazy = df_lazy.with_columns(step.exprs) - elif isinstance(step, SelectStep): # Fitted + elif isinstance(step, SelectStep): transforms.append(step) df_lazy = df_lazy.select(step.exprs) - elif isinstance(step, FilterStep): # Fitted + elif isinstance(step, FilterStep): transforms.append(step) df_lazy = df_lazy.filter(step.exprs) - elif isinstance(step, SQLStep): # Fitted + elif isinstance(step, SQLStep): transforms.append(step) df_lazy = pl.SQLContext(df=df_lazy).execute(step.sql) else: @@ -1070,10 +1077,10 @@ def materialize(self) -> Pipeline: name=self.name, target=self.target, feature_names_in_=list(self.feature_names_in_), - feature_names_out_=df_lazy.collect_schema().names() - if _IS_POLARS_V1 - else list(df_lazy.columns), + feature_names_out_=df_lazy.collect_schema().names(), transforms=transforms, + lowercase=self.lowercase, + uppercase=self.uppercase ) def fit(self, X=None, y=None) -> Pipeline: diff --git a/python/polars_ds/transforms.py b/python/polars_ds/modeling/transforms.py similarity index 86% rename from python/polars_ds/transforms.py rename to python/polars_ds/modeling/transforms.py index fa6e8647..a84a9f08 100644 --- a/python/polars_ds/transforms.py +++ b/python/polars_ds/modeling/transforms.py @@ -8,7 +8,9 @@ import polars as pl import polars.selectors as cs -from .typing import ( +from typing import List +# Internal dependencies +from polars_ds.typing import ( PolarsFrame, SimpleImputeMethod, SimpleScaleMethod, @@ -16,11 +18,8 @@ QuantileMethod, EncoderDefaultStrategy, ) -from . import num as pds_num -from . import expr_linear as lr -from ._utils import _IS_POLARS_V1 -from typing import List - +import polars_ds.exprs.num as pds_num +import polars_ds.exprs.expr_linear as lr def impute(df: PolarsFrame, cols: List[str], method: SimpleImputeMethod = "mean") -> ExprTransform: """ @@ -111,11 +110,7 @@ def linear_impute( add_bias Whether to add a bias term to the linear regression """ - if _IS_POLARS_V1: - target_name = df.lazy().select(target).collect_schema().names()[0] - else: - target_name = df.select(target).columns[0] - + target_name = df.lazy().select(target).collect_schema().names()[0] features_as_expr = [pl.col(f) for f in features] target_as_expr = pl.col(target_name) temp = ( @@ -362,9 +357,6 @@ def rank_hot_encode( if n_ranks <= 1: raise ValueError("Rank hot encoding does not work with single value ranking.") - if not _IS_POLARS_V1: - raise ValueError("Unavailable for Polars < v1.") - number_rank = list(range(n_ranks)) ranked_expr = pl.col(col).replace_strict( old=ranking, new=number_rank, default=None, return_dtype=pl.Int32 @@ -442,13 +434,9 @@ def target_encode( https://contrib.scikit-learn.org/category_encoders/targetencoder.html """ temp = df.lazy() - if _IS_POLARS_V1: - valid_cols = ( - temp.select(cols).select(cs.string() | cs.categorical()).collect_schema().names() - ) - - else: - valid_cols = temp.select(cols).select(cs.string() | cs.categorical()).columns + valid_cols = ( + temp.select(cols).select(cs.string() | cs.categorical()).collect_schema().names() + ) if len(valid_cols) == 0: raise ValueError( @@ -463,24 +451,13 @@ def target_encode( ).implode() for c in valid_cols ).collect() # add collect config.. - # POLARS_V1 - if _IS_POLARS_V1: - exprs = [ - # c[0] will be a series of struct because of the implode above. - pl.col(c.name).replace_strict( - old=c[0].struct.field("value"), new=c[0].struct.field("to"), default=default_value - ) - for c in temp.get_columns() - ] - else: - exprs = [ - # c[0] will be a series of struct because of the implode above. - pl.col(c.name).replace( - old=c[0].struct.field("value"), new=c[0].struct.field("to"), default=default_value - ) - for c in temp.get_columns() - ] - return exprs + return [ + # c[0] will be a series of struct because of the implode above. + pl.col(c.name).replace_strict( + old=c[0].struct.field("value"), new=c[0].struct.field("to"), default=default_value + ) + for c in temp.get_columns() + ] def woe_encode( @@ -514,12 +491,9 @@ def woe_encode( https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html """ temp = df.lazy() - if _IS_POLARS_V1: - valid_cols = ( - temp.select(cols).select(cs.string() | cs.categorical()).collect_schema().names() - ) - else: - valid_cols = temp.select(cols).select(cs.string() | cs.categorical()).columns + valid_cols = ( + temp.select(cols).select(cs.string() | cs.categorical()).collect_schema().names() + ) if len(valid_cols) == 0: raise ValueError( @@ -531,25 +505,14 @@ def woe_encode( temp = temp.select( pds_num.woe_discrete(c, target).implode() for c in valid_cols ).collect() # add collect config.. - # POLARS_V1 - if _IS_POLARS_V1: - exprs = [ - # c[0] will be a series of struct because of the implode above. - pl.col(c.name).replace_strict( - old=c[0].struct.field("value"), new=c[0].struct.field("woe"), default=default_value - ) - for c in temp.get_columns() - ] - else: - exprs = [ - # c[0] will be a series of struct because of the implode above. - pl.col(c.name).replace( - old=c[0].struct.field("value"), new=c[0].struct.field("woe"), default=default_value - ) - for c in temp.get_columns() - ] - return exprs + return [ + # c[0] will be a series of struct because of the implode above. + pl.col(c.name).replace_strict( + old=c[0].struct.field("value"), new=c[0].struct.field("woe"), default=default_value + ) + for c in temp.get_columns() + ] def iv_encode( @@ -583,12 +546,9 @@ def iv_encode( https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html """ temp = df.lazy() - if _IS_POLARS_V1: - valid_cols = ( - temp.select(cols).select(cs.string() | cs.categorical()).collect_schema().names() - ) - else: - valid_cols = temp.select(cols).select(cs.string() | cs.categorical()).columns + valid_cols = ( + temp.select(cols).select(cs.string() | cs.categorical()).collect_schema().names() + ) if len(valid_cols) == 0: raise ValueError( @@ -601,23 +561,13 @@ def iv_encode( pds_num.info_value_discrete(c, target, return_sum=False).implode() for c in valid_cols ).collect() # add collect config.. # POLARS_V1 - if _IS_POLARS_V1: - exprs = [ - # c[0] will be a series of struct because of the implode above. - pl.col(c.name).replace_strict( - old=c[0].struct.field("value"), new=c[0].struct.field("iv"), default=default_value - ) - for c in temp.get_columns() - ] - else: - exprs = [ - # c[0] will be a series of struct because of the implode above. - pl.col(c.name).replace( - old=c[0].struct.field("value"), new=c[0].struct.field("iv"), default=default_value - ) - for c in temp.get_columns() - ] - return exprs + return [ + # c[0] will be a series of struct because of the implode above. + pl.col(c.name).replace_strict( + old=c[0].struct.field("value"), new=c[0].struct.field("iv"), default=default_value + ) + for c in temp.get_columns() + ] def polynomial_features( diff --git a/python/polars_ds/partition/__init__.py b/python/polars_ds/partition/__init__.py new file mode 100644 index 00000000..b4900e05 --- /dev/null +++ b/python/polars_ds/partition/__init__.py @@ -0,0 +1 @@ +from .partition import PartitionHelper \ No newline at end of file diff --git a/python/polars_ds/partition/partition.py b/python/polars_ds/partition/partition.py new file mode 100644 index 00000000..0d6ec2fb --- /dev/null +++ b/python/polars_ds/partition/partition.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +import polars as pl +import polars.selectors as cs +import warnings +import sys +# Typing +from collections.abc import Callable +from typing import List, Dict, Any +# Internal Dependencies +from polars_ds.typing import PolarsFrame + +class PartitionHelper(): + """ + A transitory convenience class. + """ + + def __init__( + self, + df: PolarsFrame, + by: str | List[str] | None, + separator: str = "|", + whole_df_name: str = "df" + ): + """ + Creates a Partition Result + + Parameters + ---------- + df + Either a Polars dataframe or a Lazyframe + by + Either None, or a string or a list of strings representing column names. If by + is None, the entire df will be considered a partition. + separator + Separator for concatenating the names of different parts, if the partition is done + by multiple columns + whole_df_name + If by is None, the name for the whole df. + """ + if by is None: + self.parts: Dict[str, pl.DataFrame] = {whole_df_name: df.lazy().collect()} + else: + cols = df.select( + (cs.by_name(by)) & (cs.string() | cs.categorical() | cs.boolean()) + ).collect_schema().names() + + all_ok = cols[0] == by if isinstance(by, str) else sorted(cols) == sorted(by) + if not all_ok: + raise ValueError("Currently this only supports partitions by str, bool or categorical columns.") + + self.parts = { + separator.join((str(k) for k in keys)): value + for keys, value in df.lazy().collect().partition_by(by=by, as_dict=True).items() + } + + def __repr__(self) -> str: + output = "" + for part, df in self.parts.items(): + output += f"Paritition: {part}\n" + output += df.__repr__() + "\n" + return output + + def head(self, n:int = 5) -> Dict[str, pl.DataFrame]: + return {k: df.head(n) for k, df in self.parts.items()} + + def names(self) -> List[str]: + return list(self.parts.keys()) + + def get(self, part:str) -> pl.DataFrame | None: + return self.parts.get(part, None) + + def apply(self, func: Callable[[str, pl.DataFrame], Any]) -> Dict[str, Any]: + """ + Apply an arbitrary function to all parts in this partition. + + Parameters + ---------- + func + A function that takes in a str and a pl.DataFrame and outputs anything. The string + represents the name of the segment. Note: this is usually a partial/lambda function with + all other arguments provided. + """ + output = {} + for part, df in self.parts.items(): + try: + output[part] = func(part, df) + except Exception as e: + warnings.warn( + f"An error occured while processing for the part: {part}. This partition is omitted.\nOriginal Error Message: {e}" + , stacklevel = 2 + ) + + return output \ No newline at end of file diff --git a/python/polars_ds/sample_and_split/__init__.py b/python/polars_ds/sample_and_split/__init__.py new file mode 100644 index 00000000..9638c9cd --- /dev/null +++ b/python/polars_ds/sample_and_split/__init__.py @@ -0,0 +1 @@ +from .sample_and_split import * \ No newline at end of file diff --git a/python/polars_ds/sample_and_split.py b/python/polars_ds/sample_and_split/sample_and_split.py similarity index 97% rename from python/polars_ds/sample_and_split.py rename to python/polars_ds/sample_and_split/sample_and_split.py index 795376ec..5a0c40cf 100644 --- a/python/polars_ds/sample_and_split.py +++ b/python/polars_ds/sample_and_split/sample_and_split.py @@ -3,10 +3,18 @@ import polars as pl import random import math -from ._utils import _IS_POLARS_V1 -from .typing import PolarsFrame from typing import List, Tuple from itertools import combinations, islice +# Internal dependency +from polars_ds.typing import PolarsFrame + +__all__ = [ + "sample", + "volume_neutral", + "downsample", + "random_cols", + "split_by_ratio" +] def _sampler_expr(value: float | int, seed: int | None = None) -> pl.Expr: diff --git a/python/polars_ds/typing.py b/python/polars_ds/typing.py index 184337cb..c24d0f4b 100644 --- a/python/polars_ds/typing.py +++ b/python/polars_ds/typing.py @@ -33,12 +33,12 @@ # Need ... FitTransformFunc: TypeAlias = Callable[[PolarsFrame, List[str]], ExprTransform] -# For compatibility -IntoNumpy: TypeAlias = Union["Sequence[float]", "Sequence[int]"] -"""Anything which can be converted to a NumPy numeric array. +# # For compatibility +# IntoNumpy: TypeAlias = Union["Sequence[float]", "Sequence[int]"] +# """Anything which can be converted to a NumPy numeric array. -Examples: - >>> from polars_ds.typing import IntoNumPy - >>> def agnostic_to_numpy(s: IntoNumpy) -> np.ndarray: - ... return s.to_numpy() -""" \ No newline at end of file +# Examples: +# >>> from polars_ds.typing import IntoNumPy +# >>> def agnostic_to_numpy(s: IntoNumpy) -> np.ndarray: +# ... return s.to_numpy() +# """ \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 12f2a5ba..12762a76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ polars pre-commit ipykernel numpy +# nbformat>=4.2.0 # Need this if we have plotly diff --git a/src/num_ext/tp_fp.rs b/src/num_ext/tp_fp.rs index 0c10136f..6ede595b 100644 --- a/src/num_ext/tp_fp.rs +++ b/src/num_ext/tp_fp.rs @@ -1,3 +1,4 @@ +use faer::reborrow::IntoConst; /// All things true positive, false positive related. /// ROC AUC, Average Precision, precision, recall, etc. m use polars::prelude::*; @@ -125,12 +126,7 @@ fn pl_combo_b(inputs: &[Series]) -> PolarsResult { let y = y.cont_slice()?; // Zero copy let x = x.cont_slice()?; // Zero copy - let auc = if x.len() == 1 { - // x[0] >= 0. Inserting a 0 on the left means we don't need the - sign like the case below - super::trapz::trapz(&[0., y[0]], &[0., x[0]]) - } else { - -super::trapz::trapz(y, x) - }; + let auc = -super::trapz::trapz(y, x); let auc: Series = Series::from_vec("roc_auc".into(), vec![auc]); // Average Precision @@ -163,12 +159,15 @@ fn pl_tpr_fpr(inputs: &[Series]) -> PolarsResult { // actual, when passed in, is always u32 (done in Python extension side) let actual = &inputs[0]; let predicted = &inputs[1]; - let positive_count = actual.sum::().unwrap_or(0); + let positive_cnt = actual.sum::().unwrap_or(0); - if positive_count == 0 { - Ok(Series::from_iter([f64::NAN])) + if positive_cnt == 0 { + let tpr = Series::from_vec("tpr".into(), vec![f64::NAN]); + let fpr = Series::from_vec("fpr".into(), vec![f64::NAN]); + let ca = StructChunked::from_columns("tpr_fpr".into(), 1, &[tpr.into_column(), fpr.into_column()])?; + Ok(ca.into_series()) } else { - let frame = tp_fp_frame(predicted, actual, positive_count, true)? + let frame = tp_fp_frame(predicted, actual, positive_cnt, true)? .select([col("threshold"), col("tpr"), col("fpr")]) .collect()?; @@ -216,13 +215,12 @@ fn pl_roc_auc(inputs: &[Series]) -> PolarsResult { // actual, when passed in, is always u32 (done in Python extension side) let actual = &inputs[0]; let predicted = &inputs[1]; - - let positive_count = actual.sum::().unwrap_or(0); - if positive_count == 0 { + let positive_cnt = actual.sum::().unwrap_or(0); + if positive_cnt == 0 { return Ok(Series::from_iter([f64::NAN])); } - let mut binding = tp_fp_frame(predicted, actual, positive_count, true)? + let mut binding = tp_fp_frame(predicted, actual, positive_cnt, true)? .select([col("tpr"), col("fpr")]) .collect()?; let frame = binding.align_chunks(); @@ -237,13 +235,7 @@ fn pl_roc_auc(inputs: &[Series]) -> PolarsResult { let y = y.cont_slice()?; let x = x.cont_slice()?; - let auc = if x.len() == 1 { - // x[0] >= 0. Inserting a 0 on the left means we don't need the - sign like the case below - super::trapz::trapz(&[0., y[0]], &[0., x[0]]) - } else { - -super::trapz::trapz(y, x) - }; - // let auc: f64 = -super::trapz::trapz(y, x); + let auc = -super::trapz::trapz(y, x); Ok(Series::from_vec("roc_auc".into(), vec![auc])) } diff --git a/src/num_ext/trapz.rs b/src/num_ext/trapz.rs index 78ec8d24..2ac4cb58 100644 --- a/src/num_ext/trapz.rs +++ b/src/num_ext/trapz.rs @@ -5,11 +5,16 @@ use pyo3_polars::derive::polars_expr; #[inline(always)] pub fn trapz(y: &[f64], x: &[f64]) -> f64 { - let mut y_s = vec![0.; y.len() - 1]; - cfavml::add_vector(&y[1..], &y[..y.len() - 1], &mut y_s); - let mut x_d = vec![0.; y.len() - 1]; - cfavml::sub_vector(&x[1..], &x[..x.len() - 1], &mut x_d); - 0.5 * cfavml::dot(&y_s, &x_d) + // x.len() == y.len() checked + if x.len() == 1 && y.len() == 1 { + y[0] * x[0] * -0.5 // y[0] * (-x[0]) * 0.5 + } else { + let mut y_d = vec![0.; y.len() - 1]; + cfavml::add_vector(&y[1..], &y[..y.len() - 1], &mut y_d); + let mut x_d = vec![0.; y.len() - 1]; + cfavml::sub_vector(&x[1..], &x[..x.len() - 1], &mut x_d); + 0.5 * cfavml::dot(&y_d, &x_d) + } } pub fn trapz_dx(y: &[f64], dx: f64) -> f64 { @@ -22,17 +27,13 @@ pub fn trapz_dx(y: &[f64], dx: f64) -> f64 { fn pl_trapz(inputs: &[Series]) -> PolarsResult { let y = inputs[0].f64()?; let x = inputs[1].f64()?; - if y.len() < 2 { + if y.len() < 1 || x.has_nulls() || y.has_nulls() { let ca = Float64Chunked::from_slice("".into(), &[f64::NAN]); return Ok(ca.into_series()); } - if x.has_nulls() || y.has_nulls() { - return Err(PolarsError::ComputeError( - "For trapezoidal integration to work, x and y must not contain nulls.".into(), - )); - } + let y = y.cont_slice()?; - if x.len() == 1 { + if x.len() == 1 && y.len() > 1 { let dx = x.get(0).unwrap(); let ca = Float64Chunked::from_slice("".into(), &[trapz_dx(y, dx)]); Ok(ca.into_series()) diff --git a/tests/test_compat.py b/tests/test_compat.py index 15f6677f..476cee73 100644 --- a/tests/test_compat.py +++ b/tests/test_compat.py @@ -6,13 +6,13 @@ import numpy as np import polars_ds as pds -import polars_ds.expr_linear as pds_linear -import polars_ds.num as pds_num -import polars_ds.string as pds_str -import polars_ds.stats as pds_stats -import polars_ds.ts_features as pds_ts -import polars_ds.expr_knn as pds_knn -import polars_ds.metrics as pds_metrics +import polars_ds.exprs.expr_linear as pds_linear +import polars_ds.exprs.num as pds_num +import polars_ds.exprs.string as pds_str +import polars_ds.exprs.stats as pds_stats +import polars_ds.exprs.ts_features as pds_ts +import polars_ds.exprs.expr_knn as pds_knn +import polars_ds.exprs.metrics as pds_metrics from polars.testing import assert_frame_equal, assert_series_equal from polars_ds.compat import compat as pds2 diff --git a/tests/test_transforms.py b/tests/test_transforms.py index bc293424..724389f8 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -1,6 +1,6 @@ import polars as pl import polars_ds as pds -import polars_ds.transforms as t +import polars_ds.modeling.transforms as t import pytest from polars.testing import assert_frame_equal