diff --git a/docs/pipeline.md b/docs/pipeline.md new file mode 100644 index 00000000..f69c21c6 --- /dev/null +++ b/docs/pipeline.md @@ -0,0 +1,3 @@ +## Polars Native Machine Learning Pipeline + +::: polars_ds.pipeline \ No newline at end of file diff --git a/docs/sample.md b/docs/sample.md new file mode 100644 index 00000000..a6a2b493 --- /dev/null +++ b/docs/sample.md @@ -0,0 +1,3 @@ +## Sample + +::: polars_ds.sample \ No newline at end of file diff --git a/examples/sample_and_split.ipynb b/examples/sample_and_split.ipynb index a59f765f..3906b586 100644 --- a/examples/sample_and_split.ipynb +++ b/examples/sample_and_split.ipynb @@ -42,7 +42,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
01.8649660.9831122.1340980.201208-510.1678171"A"
13.6813650.0726160.24552-2.180395-1230.7970292"A"
27.1381630.0689230.498513-1.737763-914.7574362"A"
39.24110.0061290.670527-1.500905-1025.1443721"A"
47.8749720.6387642.1595890.4902171329.5465350"A"
" + "shape: (5, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
00.1013010.2412360.068629-1.546608-1820.0649860"A"
17.7637780.6885271.564067-0.2198752842.1289220"A"
26.6921040.3020392.184995-2.038565-107.6932760"A"
38.0697980.5897820.047991-1.02959452.0452071"A"
44.7099250.2899220.5387211.08094711.8071541"A"
" ], "text/plain": [ "shape: (5, 8)\n", @@ -51,11 +51,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", - "│ 0 ┆ 1.864966 ┆ 0.983112 ┆ 2.134098 ┆ 0.201208 ┆ -510.167817 ┆ 1 ┆ A │\n", - "│ 1 ┆ 3.681365 ┆ 0.072616 ┆ 0.24552 ┆ -2.180395 ┆ -1230.797029 ┆ 2 ┆ A │\n", - "│ 2 ┆ 7.138163 ┆ 0.068923 ┆ 0.498513 ┆ -1.737763 ┆ -914.757436 ┆ 2 ┆ A │\n", - "│ 3 ┆ 9.2411 ┆ 0.006129 ┆ 0.670527 ┆ -1.500905 ┆ -1025.144372 ┆ 1 ┆ A │\n", - "│ 4 ┆ 7.874972 ┆ 0.638764 ┆ 2.159589 ┆ 0.490217 ┆ 1329.546535 ┆ 0 ┆ A │\n", + "│ 0 ┆ 0.101301 ┆ 0.241236 ┆ 0.068629 ┆ -1.546608 ┆ -1820.064986 ┆ 0 ┆ A │\n", + "│ 1 ┆ 7.763778 ┆ 0.688527 ┆ 1.564067 ┆ -0.219875 ┆ 2842.128922 ┆ 0 ┆ A │\n", + "│ 2 ┆ 6.692104 ┆ 0.302039 ┆ 2.184995 ┆ -2.038565 ┆ -107.693276 ┆ 0 ┆ A │\n", + "│ 3 ┆ 8.069798 ┆ 0.589782 ┆ 0.047991 ┆ -1.029594 ┆ 52.045207 ┆ 1 ┆ A │\n", + "│ 4 ┆ 4.709925 ┆ 0.289922 ┆ 0.538721 ┆ 1.08094 ┆ 711.807154 ┆ 1 ┆ A │\n", "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" ] }, @@ -85,7 +85,7 @@ { "data": { "text/plain": [ - "['row_num', 'fat_normal', 'flags']" + "['row_num', 'uniform_1', 'fat_normal']" ] }, "execution_count": 3, @@ -112,27 +112,27 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (60_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
01.8649660.9831122.1340980.201208-510.1678171"A"
13.6813650.0726160.24552-2.180395-1230.7970292"A"
27.1381630.0689230.498513-1.737763-914.7574362"A"
47.8749720.6387642.1595890.4902171329.5465350"A"
51.4794640.7036490.480330.550675186.8791461"A"
611.3540590.8817352.399495-0.720839-376.3894662"A"
77.9811270.1998840.335183-0.223638-168.9363822"A"
83.5393160.6187862.968125-0.0664333852.2901150"A"
122.8815490.3066531.8282170.5947231189.4616120"A"
1311.2867610.2005524.8368370.725427-18.0392012"A"
167.8793370.1943511.395224-1.621337253.4880520"A"
176.887810.4368732.564610.799571-573.2290852"A"
999843.1753640.5212142.621609-0.259841-207.3220521"C"
999852.9824970.0469592.013501-1.38318-449.4960940"C"
999869.7154970.1730487.7875870.031314-745.8021711"C"
999870.4641490.0525770.251207-0.550988211.9947151"C"
999882.900020.1407290.390610.420128543.3484450"C"
999897.7259160.6088431.5186040.872585-1046.0551521"C"
999902.5504470.1368281.878146-0.4438941299.1665741"C"
999936.3449320.914680.0776940.667014274.1970441"C"
999957.7511360.9912150.3419380.74724468.0911641"C"
999968.9321530.1493422.0171350.254913-1555.191880"C"
999976.3664580.8251371.149404-0.653029211.2889541"C"
999995.0213820.4058932.7458670.8005251572.6187831"C"
" + "shape: (60_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
26.6921040.3020392.184995-2.038565-107.6932760"A"
38.0697980.5897820.047991-1.02959452.0452071"A"
44.7099250.2899220.5387211.08094711.8071541"A"
79.6797370.4315941.131895-0.7396371269.6139961"A"
810.8032240.5463173.25459-0.904231243.5727332"A"
9999110.253850.6326150.263188-0.388282273.4174952"C"
999935.2133480.1994944.928223-0.3626071729.1267160"C"
999949.4416030.3479070.318096-0.112797242.4572840"C"
999987.3744660.0307313.606166-0.5822651290.9373560"C"
999997.1648530.3997910.354686-0.9998681678.1904051"C"
" ], "text/plain": [ "shape: (60_000, 8)\n", - "┌─────────┬───────────┬───────────┬──────────┬───────────┬──────────────┬───────┬──────────┐\n", - "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", - "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", - "│ 0 ┆ 1.864966 ┆ 0.983112 ┆ 2.134098 ┆ 0.201208 ┆ -510.167817 ┆ 1 ┆ A │\n", - "│ 1 ┆ 3.681365 ┆ 0.072616 ┆ 0.24552 ┆ -2.180395 ┆ -1230.797029 ┆ 2 ┆ A │\n", - "│ 2 ┆ 7.138163 ┆ 0.068923 ┆ 0.498513 ┆ -1.737763 ┆ -914.757436 ┆ 2 ┆ A │\n", - "│ 4 ┆ 7.874972 ┆ 0.638764 ┆ 2.159589 ┆ 0.490217 ┆ 1329.546535 ┆ 0 ┆ A │\n", - "│ 5 ┆ 1.479464 ┆ 0.703649 ┆ 0.48033 ┆ 0.550675 ┆ 186.879146 ┆ 1 ┆ A │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 99993 ┆ 6.344932 ┆ 0.91468 ┆ 0.077694 ┆ 0.667014 ┆ 274.197044 ┆ 1 ┆ C │\n", - "│ 99995 ┆ 7.751136 ┆ 0.991215 ┆ 0.341938 ┆ 0.74724 ┆ 468.091164 ┆ 1 ┆ C │\n", - "│ 99996 ┆ 8.932153 ┆ 0.149342 ┆ 2.017135 ┆ 0.254913 ┆ -1555.19188 ┆ 0 ┆ C │\n", - "│ 99997 ┆ 6.366458 ┆ 0.825137 ┆ 1.149404 ┆ -0.653029 ┆ 211.288954 ┆ 1 ┆ C │\n", - "│ 99999 ┆ 5.021382 ┆ 0.405893 ┆ 2.745867 ┆ 0.800525 ┆ 1572.618783 ┆ 1 ┆ C │\n", - "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" + "┌─────────┬───────────┬───────────┬──────────┬───────────┬─────────────┬───────┬──────────┐\n", + "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", + "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪═════════════╪═══════╪══════════╡\n", + "│ 2 ┆ 6.692104 ┆ 0.302039 ┆ 2.184995 ┆ -2.038565 ┆ -107.693276 ┆ 0 ┆ A │\n", + "│ 3 ┆ 8.069798 ┆ 0.589782 ┆ 0.047991 ┆ -1.029594 ┆ 52.045207 ┆ 1 ┆ A │\n", + "│ 4 ┆ 4.709925 ┆ 0.289922 ┆ 0.538721 ┆ 
1.08094 ┆ 711.807154 ┆ 1 ┆ A │\n", + "│ 7 ┆ 9.679737 ┆ 0.431594 ┆ 1.131895 ┆ -0.739637 ┆ 1269.613996 ┆ 1 ┆ A │\n", + "│ 8 ┆ 10.803224 ┆ 0.546317 ┆ 3.25459 ┆ -0.904231 ┆ 243.572733 ┆ 2 ┆ A │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 99991 ┆ 10.25385 ┆ 0.632615 ┆ 0.263188 ┆ -0.388282 ┆ 273.417495 ┆ 2 ┆ C │\n", + "│ 99993 ┆ 5.213348 ┆ 0.199494 ┆ 4.928223 ┆ -0.362607 ┆ 1729.126716 ┆ 0 ┆ C │\n", + "│ 99994 ┆ 9.441603 ┆ 0.347907 ┆ 0.318096 ┆ -0.112797 ┆ 242.457284 ┆ 0 ┆ C │\n", + "│ 99998 ┆ 7.374466 ┆ 0.030731 ┆ 3.606166 ┆ -0.582265 ┆ 1290.937356 ┆ 0 ┆ C │\n", + "│ 99999 ┆ 7.164853 ┆ 0.399791 ┆ 0.354686 ┆ -0.999868 ┆ 1678.190405 ┆ 1 ┆ C │\n", + "└─────────┴───────────┴───────────┴──────────┴───────────┴─────────────┴───────┴──────────┘" ] }, "execution_count": 4, @@ -160,27 +160,27 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (30_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
47.8749720.6387642.1595890.4902171329.5465350"A"
51.4794640.7036490.480330.550675186.8791461"A"
611.3540590.8817352.399495-0.720839-376.3894662"A"
83.5393160.6187862.968125-0.0664333852.2901150"A"
122.8815490.3066531.8282170.5947231189.4616120"A"
1311.2867610.2005524.8368370.725427-18.0392012"A"
222.8331850.0439573.999065-0.735753357.5254222"A"
2511.3591160.1911830.7038920.195382-1028.0611740"A"
276.2765640.8925710.7340940.648778-1995.4656520"A"
3211.9591820.6652872.5746191.856659-982.2711821"A"
346.4937560.9473861.7455730.261902598.1277350"A"
419.5498290.698022.068939-1.50605357.8841842"A"
999658.6122620.6928543.4557560.779895433.9661472"C"
999663.3432890.4870113.6568450.91173870.3138381"C"
999684.459780.0797413.194479-0.210791-843.7184072"C"
999719.7119850.0500490.0780041.655786-1371.0707081"C"
999721.0474840.8086190.3478940.834854-1020.647332"C"
999778.5894070.5357410.1315771.839748-104.3721551"C"
999809.5130810.9587271.1973242.042775-1971.0612721"C"
999822.0480970.195731.6077110.141-525.7571091"C"
999843.1753640.5212142.621609-0.259841-207.3220521"C"
999910.1558650.9388761.293072-3.370048677.1274532"C"
999957.7511360.9912150.3419380.74724468.0911641"C"
999976.3664580.8251371.149404-0.653029211.2889541"C"
" + "shape: (30_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
143.290520.627660.684437-0.1146061440.3249710"A"
1510.0338490.5558320.2656981.894409-2266.656471"A"
1810.2090920.4114131.7813091.50218163.5215151"A"
235.4619890.6312792.069071-0.735686-1463.488850"A"
278.9022780.7325492.877614-0.049382-1301.7452180"A"
999746.5416710.0520392.622168-0.451139-1270.9593642"C"
999761.1476420.8852230.7041890.61381344.8616591"C"
999857.3210520.764916.039978-0.407754304.3400421"C"
9999110.253850.6326150.263188-0.388282273.4174952"C"
999951.9484280.9232936.168104-0.151161997.1595140"C"
" ], "text/plain": [ "shape: (30_000, 8)\n", - "┌─────────┬───────────┬───────────┬──────────┬───────────┬─────────────┬───────┬──────────┐\n", - "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", - "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪═════════════╪═══════╪══════════╡\n", - "│ 4 ┆ 7.874972 ┆ 0.638764 ┆ 2.159589 ┆ 0.490217 ┆ 1329.546535 ┆ 0 ┆ A │\n", - "│ 5 ┆ 1.479464 ┆ 0.703649 ┆ 0.48033 ┆ 0.550675 ┆ 186.879146 ┆ 1 ┆ A │\n", - "│ 6 ┆ 11.354059 ┆ 0.881735 ┆ 2.399495 ┆ -0.720839 ┆ -376.389466 ┆ 2 ┆ A │\n", - "│ 8 ┆ 3.539316 ┆ 0.618786 ┆ 2.968125 ┆ -0.066433 ┆ 3852.290115 ┆ 0 ┆ A │\n", - "│ 12 ┆ 2.881549 ┆ 0.306653 ┆ 1.828217 ┆ 0.594723 ┆ 1189.461612 ┆ 0 ┆ A │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 99982 ┆ 2.048097 ┆ 0.19573 ┆ 1.607711 ┆ 0.141 ┆ -525.757109 ┆ 1 ┆ C │\n", - "│ 99984 ┆ 3.175364 ┆ 0.521214 ┆ 2.621609 ┆ -0.259841 ┆ -207.322052 ┆ 1 ┆ C │\n", - "│ 99991 ┆ 0.155865 ┆ 0.938876 ┆ 1.293072 ┆ -3.370048 ┆ 677.127453 ┆ 2 ┆ C │\n", - "│ 99995 ┆ 7.751136 ┆ 0.991215 ┆ 0.341938 ┆ 0.74724 ┆ 468.091164 ┆ 1 ┆ C │\n", - "│ 99997 ┆ 6.366458 ┆ 0.825137 ┆ 1.149404 ┆ -0.653029 ┆ 211.288954 ┆ 1 ┆ C │\n", - "└─────────┴───────────┴───────────┴──────────┴───────────┴─────────────┴───────┴──────────┘" + "┌─────────┬───────────┬───────────┬──────────┬───────────┬──────────────┬───────┬──────────┐\n", + "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", + "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", + "│ 14 ┆ 3.29052 ┆ 0.62766 ┆ 0.684437 ┆ -0.114606 ┆ 1440.324971 ┆ 0 ┆ A │\n", + "│ 15 ┆ 10.033849 ┆ 0.555832 ┆ 0.265698 ┆ 1.894409 ┆ -2266.65647 ┆ 1 ┆ A │\n", + "│ 18 ┆ 10.209092 ┆ 0.411413 ┆ 1.781309 ┆ 
1.502181 ┆ 63.521515 ┆ 1 ┆ A │\n", + "│ 23 ┆ 5.461989 ┆ 0.631279 ┆ 2.069071 ┆ -0.735686 ┆ -1463.48885 ┆ 0 ┆ A │\n", + "│ 27 ┆ 8.902278 ┆ 0.732549 ┆ 2.877614 ┆ -0.049382 ┆ -1301.745218 ┆ 0 ┆ A │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 99974 ┆ 6.541671 ┆ 0.052039 ┆ 2.622168 ┆ -0.451139 ┆ -1270.959364 ┆ 2 ┆ C │\n", + "│ 99976 ┆ 1.147642 ┆ 0.885223 ┆ 0.704189 ┆ 0.61381 ┆ 344.861659 ┆ 1 ┆ C │\n", + "│ 99985 ┆ 7.321052 ┆ 0.76491 ┆ 6.039978 ┆ -0.407754 ┆ 304.340042 ┆ 1 ┆ C │\n", + "│ 99991 ┆ 10.25385 ┆ 0.632615 ┆ 0.263188 ┆ -0.388282 ┆ 273.417495 ┆ 2 ┆ C │\n", + "│ 99995 ┆ 1.948428 ┆ 0.923293 ┆ 6.168104 ┆ -0.151161 ┆ 997.159514 ┆ 0 ┆ C │\n", + "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" ] }, "execution_count": 5, @@ -207,7 +207,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
033281
133469
233250
" + "shape: (3, 2)
flagslen
i32u32
033282
133654
233064
" ], "text/plain": [ "shape: (3, 2)\n", @@ -216,9 +216,9 @@ "│ --- ┆ --- │\n", "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", - "│ 0 ┆ 33281 │\n", - "│ 1 ┆ 33469 │\n", - "│ 2 ┆ 33250 │\n", + "│ 0 ┆ 33282 │\n", + "│ 1 ┆ 33654 │\n", + "│ 2 ┆ 33064 │\n", "└───────┴───────┘" ] }, @@ -246,7 +246,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
016641
133469
233250
" + "shape: (3, 2)
flagslen
i32u32
016641
133654
233064
" ], "text/plain": [ "shape: (3, 2)\n", @@ -256,8 +256,8 @@ "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", "│ 0 ┆ 16641 │\n", - "│ 1 ┆ 33469 │\n", - "│ 2 ┆ 33250 │\n", + "│ 1 ┆ 33654 │\n", + "│ 2 ┆ 33064 │\n", "└───────┴───────┘" ] }, @@ -290,7 +290,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
016641
110041
213300
" + "shape: (3, 2)
flagslen
i32u32
016641
110097
213226
" ], "text/plain": [ "shape: (3, 2)\n", @@ -300,8 +300,8 @@ "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", "│ 0 ┆ 16641 │\n", - "│ 1 ┆ 10041 │\n", - "│ 2 ┆ 13300 │\n", + "│ 1 ┆ 10097 │\n", + "│ 2 ┆ 13226 │\n", "└───────┴───────┘" ] }, @@ -466,7 +466,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
categorylen
stru32
"A"10000
"B"4249
"C"5751
" + "shape: (3, 2)
categorylen
stru32
"A"10000
"B"4302
"C"5698
" ], "text/plain": [ "shape: (3, 2)\n", @@ -476,8 +476,8 @@ "│ str ┆ u32 │\n", "╞══════════╪═══════╡\n", "│ A ┆ 10000 │\n", - "│ B ┆ 4249 │\n", - "│ C ┆ 5751 │\n", + "│ B ┆ 4302 │\n", + "│ C ┆ 5698 │\n", "└──────────┴───────┘" ] }, @@ -511,7 +511,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 3)
categoryflagslen
stri32u32
"A"09996
"A"19996
"A"29996
"B"09970
"B"19970
"B"29970
"C"013251
"C"113251
"C"213251
" + "shape: (9, 3)
categoryflagslen
stri32u32
"A"09865
"A"19865
"A"29865
"B"09909
"B"19909
"B"29909
"C"013224
"C"113224
"C"213224
" ], "text/plain": [ "shape: (9, 3)\n", @@ -520,15 +520,15 @@ "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i32 ┆ u32 │\n", "╞══════════╪═══════╪═══════╡\n", - "│ A ┆ 0 ┆ 9996 │\n", - "│ A ┆ 1 ┆ 9996 │\n", - "│ A ┆ 2 ┆ 9996 │\n", - "│ B ┆ 0 ┆ 9970 │\n", - "│ B ┆ 1 ┆ 9970 │\n", - "│ B ┆ 2 ┆ 9970 │\n", - "│ C ┆ 0 ┆ 13251 │\n", - "│ C ┆ 1 ┆ 13251 │\n", - "│ C ┆ 2 ┆ 13251 │\n", + "│ A ┆ 0 ┆ 9865 │\n", + "│ A ┆ 1 ┆ 9865 │\n", + "│ A ┆ 2 ┆ 9865 │\n", + "│ B ┆ 0 ┆ 9909 │\n", + "│ B ┆ 1 ┆ 9909 │\n", + "│ B ┆ 2 ┆ 9909 │\n", + "│ C ┆ 0 ┆ 13224 │\n", + "│ C ┆ 1 ┆ 13224 │\n", + "│ C ┆ 2 ┆ 13224 │\n", "└──────────┴───────┴───────┘" ] }, @@ -563,7 +563,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 3)
categoryflagslen
stri32u32
"A"09996
"A"19996
"A"29996
"B"09970
"B"19970
"B"29970
"C"010000
"C"110000
"C"210000
" + "shape: (9, 3)
categoryflagslen
stri32u32
"A"09865
"A"19865
"A"29865
"B"09909
"B"19909
"B"29909
"C"010000
"C"110000
"C"210000
" ], "text/plain": [ "shape: (9, 3)\n", @@ -572,12 +572,12 @@ "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i32 ┆ u32 │\n", "╞══════════╪═══════╪═══════╡\n", - "│ A ┆ 0 ┆ 9996 │\n", - "│ A ┆ 1 ┆ 9996 │\n", - "│ A ┆ 2 ┆ 9996 │\n", - "│ B ┆ 0 ┆ 9970 │\n", - "│ B ┆ 1 ┆ 9970 │\n", - "│ B ┆ 2 ┆ 9970 │\n", + "│ A ┆ 0 ┆ 9865 │\n", + "│ A ┆ 1 ┆ 9865 │\n", + "│ A ┆ 2 ┆ 9865 │\n", + "│ B ┆ 0 ┆ 9909 │\n", + "│ B ┆ 1 ┆ 9909 │\n", + "│ B ┆ 2 ┆ 9909 │\n", "│ C ┆ 0 ┆ 10000 │\n", "│ C ┆ 1 ┆ 10000 │\n", "│ C ┆ 2 ┆ 10000 │\n", diff --git a/mkdocs.yml b/mkdocs.yml index e751aa33..190fd62c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -6,6 +6,8 @@ use_directory_urls: false nav: - Home: index.md - Diagnosis: dia.md +- Pipeline: pipeline.md +- Sample: sample.md - Numerical Extension: num.md - Stats Extension: stats.md - String Extension: str2.md diff --git a/python/polars_ds/sample.py b/python/polars_ds/sample.py index ca44c815..469ee9f5 100644 --- a/python/polars_ds/sample.py +++ b/python/polars_ds/sample.py @@ -47,9 +47,7 @@ def volume_neutral( seed: Optional[int] = None, ) -> pl.DataFrame: """ - Say we have a reference column, which is discrete. Let's say it has three distinct values, A, - B, and C, with a, b, c being the value counts. It will randomly select min(a, b, c, target_volume) - rows from each category, thus the name volume neutral. + Randomly select the same number of rows from each segment in `by` (hence the name volume neutral), optionally within each level of the control column(s). Parameters ---------- @@ -62,7 +60,7 @@ def volume_neutral( Additional level(s). If not none, the volume neutral selection will happen at the sublevel of the control column(s). See example. target_volume - If none, it will select min(a, b, c) rows, this means that one group is always fully selected. + If None, it will select min(a, b, c) rows from each segment, where a, b, c are the row counts of the segments. seed A random seed """