diff --git a/poetry.lock b/poetry.lock index 34fdb7d55..327dcbba0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -388,7 +388,7 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] [[package]] name = "dlt" -version = "0.3.7" +version = "0.3.8" description = "DLT is an open-source python-native scalable data loading framework that does not require any devops efforts to run." category = "main" optional = false @@ -707,7 +707,7 @@ gitdb = ">=4.0.1,<5" [[package]] name = "giturlparse" -version = "0.11.0" +version = "0.11.1" description = "A Git URL parsing module (supports parsing and rewriting)" category = "main" optional = false @@ -1503,7 +1503,7 @@ python-versions = ">=3.7" [[package]] name = "overrides" -version = "7.3.1" +version = "7.4.0" description = "A decorator to automatically detect mismatch when overriding a method." category = "dev" optional = false @@ -1738,7 +1738,7 @@ python-versions = ">=3.7" [[package]] name = "psycopg2-binary" -version = "2.9.6" +version = "2.9.7" description = "psycopg2 - Python-PostgreSQL Database Adapter" category = "main" optional = false @@ -1854,7 +1854,7 @@ python-versions = ">=3.8" [[package]] name = "pygments" -version = "2.15.1" +version = "2.16.1" description = "Pygments is a syntax highlighting package written in Python." category = "dev" optional = false @@ -2597,7 +2597,7 @@ python-versions = ">=3.7" [[package]] name = "wheel" -version = "0.41.0" +version = "0.41.1" description = "A built-package format for Python" category = "main" optional = false @@ -2700,7 +2700,7 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "1.1" python-versions = "^3.8.1" -content-hash = "4a19e00b10f5adb1f21064c7244f48107fd5884833b31d91e0ada65b834d9b44" +content-hash = "0bd511b05b65c1a3236b3a57393fc194b710a663405bf1345c885d6f48396805" [metadata.files] aiohttp = [ @@ -3142,8 +3142,8 @@ deprecated = [ {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, ] dlt = [ - {file = "dlt-0.3.7-py3-none-any.whl", hash = "sha256:1b3ada51efb42631efc9af2011b16f4a6e7894d562f653c76ddee148360c1301"}, - {file = "dlt-0.3.7.tar.gz", hash = "sha256:9949f3767d079ae05b0c0caba5e4cb83dd08abc5b7ba10fdc195d13c8553fdc0"}, + {file = "dlt-0.3.8-py3-none-any.whl", hash = "sha256:bbe0a9e4b5732ce9ffb604948b382b9cfc5810d6e185d5fa3f50603b671b6737"}, + {file = "dlt-0.3.8.tar.gz", hash = "sha256:db2882e3cc4adecdd61e4e8c20a00b5c4a5715d81d67069681cc661a3c811f6e"}, ] domdf-python-tools = [ {file = "domdf_python_tools-3.6.1-py3-none-any.whl", hash = "sha256:e18158460850957f18e740eb94ede56f580ddb0cb162ab9d9834ed8bbb1b6431"}, @@ -3331,8 +3331,8 @@ gitpython = [ {file = "GitPython-3.1.32.tar.gz", hash = "sha256:8d9b8cb1e80b9735e8717c9362079d3ce4c6e5ddeebedd0361b228c3a67a62f6"}, ] giturlparse = [ - {file = "giturlparse-0.11.0-py2.py3-none-any.whl", hash = "sha256:66871cf1f53a32b9f8b541ffd194375ecaad6ba288f64c4dd834408a2ba4af89"}, - {file = "giturlparse-0.11.0.tar.gz", hash = "sha256:85a269967a1659bd8f71c95240dc16e74273da4c6d32a2021e9c3b2eef0a5d51"}, + {file = "giturlparse-0.11.1-py2.py3-none-any.whl", hash = "sha256:6422f25c8ca563e1a3cb6b85862e48614be804cd1334e6d84be5630eb26b343f"}, + {file = "giturlparse-0.11.1.tar.gz", hash = "sha256:cdbe0c062096c69e00f08397826dddebc1f73bc15b793994579c13aafc70c990"}, ] google-analytics-data = [ {file = "google-analytics-data-0.16.3.tar.gz", hash = "sha256:f29431ec63ab462f7a9b42227521d148c877307c629e308c284025ad834aab52"}, @@ -4113,8 +4113,8 @@ orjson = [ {file = "orjson-3.9.2.tar.gz", 
hash = "sha256:24257c8f641979bf25ecd3e27251b5cc194cdd3a6e96004aac8446f5e63d9664"}, ] overrides = [ - {file = "overrides-7.3.1-py3-none-any.whl", hash = "sha256:6187d8710a935d09b0bcef8238301d6ee2569d2ac1ae0ec39a8c7924e27f58ca"}, - {file = "overrides-7.3.1.tar.gz", hash = "sha256:8b97c6c1e1681b78cbc9424b138d880f0803c2254c5ebaabdde57bb6c62093f2"}, + {file = "overrides-7.4.0-py3-none-any.whl", hash = "sha256:3ad24583f86d6d7a49049695efe9933e67ba62f0c7625d53c59fa832ce4b8b7d"}, + {file = "overrides-7.4.0.tar.gz", hash = "sha256:9502a3cca51f4fac40b5feca985b6703a5c1f6ad815588a7ca9e285b9dca6757"}, ] packaging = [ {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, @@ -4292,68 +4292,66 @@ protobuf = [ {file = "protobuf-4.23.4.tar.gz", hash = "sha256:ccd9430c0719dce806b93f89c91de7977304729e55377f872a92465d548329a9"}, ] psycopg2-binary = [ - {file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"}, - {file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"}, - {file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"}, - {file = 
"psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = 
"sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"}, + {file = "psycopg2-binary-2.9.7.tar.gz", hash = "sha256:1b918f64a51ffe19cd2e230b3240ba481330ce1d4b7875ae67305bd1d37b041c"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ea5f8ee87f1eddc818fc04649d952c526db4426d26bab16efbe5a0c52b27d6ab"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2993ccb2b7e80844d534e55e0f12534c2871952f78e0da33c35e648bf002bbff"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dbbc3c5d15ed76b0d9db7753c0db40899136ecfe97d50cbde918f630c5eb857a"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:692df8763b71d42eb8343f54091368f6f6c9cfc56dc391858cdb3c3ef1e3e584"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dcfd5d37e027ec393a303cc0a216be564b96c80ba532f3d1e0d2b5e5e4b1e6e"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17cc17a70dfb295a240db7f65b6d8153c3d81efb145d76da1e4a096e9c5c0e63"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e5666632ba2b0d9757b38fc17337d84bdf932d38563c5234f5f8c54fd01349c9"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7db7b9b701974c96a88997d458b38ccb110eba8f805d4b4f74944aac48639b42"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c82986635a16fb1fa15cd5436035c88bc65c3d5ced1cfaac7f357ee9e9deddd4"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4fe13712357d802080cfccbf8c6266a3121dc0e27e2144819029095ccf708372"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-win32.whl", hash = "sha256:122641b7fab18ef76b18860dd0c772290566b6fb30cc08e923ad73d17461dc63"}, + {file = "psycopg2_binary-2.9.7-cp310-cp310-win_amd64.whl", hash = "sha256:f8651cf1f144f9ee0fa7d1a1df61a9184ab72962531ca99f077bbdcba3947c58"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4ecc15666f16f97709106d87284c136cdc82647e1c3f8392a672616aed3c7151"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fbb1184c7e9d28d67671992970718c05af5f77fc88e26fd7136613c4ece1f89"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a7968fd20bd550431837656872c19575b687f3f6f98120046228e451e4064df"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:094af2e77a1976efd4956a031028774b827029729725e136514aae3cdf49b87b"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:26484e913d472ecb6b45937ea55ce29c57c662066d222fb0fbdc1fab457f18c5"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f309b77a7c716e6ed9891b9b42953c3ff7d533dc548c1e33fddc73d2f5e21f9"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6d92e139ca388ccfe8c04aacc163756e55ba4c623c6ba13d5d1595ed97523e4b"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2df562bb2e4e00ee064779902d721223cfa9f8f58e7e52318c97d139cf7f012d"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = 
"sha256:4eec5d36dbcfc076caab61a2114c12094c0b7027d57e9e4387b634e8ab36fd44"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1011eeb0c51e5b9ea1016f0f45fa23aca63966a4c0afcf0340ccabe85a9f65bd"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-win32.whl", hash = "sha256:ded8e15f7550db9e75c60b3d9fcbc7737fea258a0f10032cdb7edc26c2a671fd"}, + {file = "psycopg2_binary-2.9.7-cp311-cp311-win_amd64.whl", hash = "sha256:8a136c8aaf6615653450817a7abe0fc01e4ea720ae41dfb2823eccae4b9062a3"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2dec5a75a3a5d42b120e88e6ed3e3b37b46459202bb8e36cd67591b6e5feebc1"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc10da7e7df3380426521e8c1ed975d22df678639da2ed0ec3244c3dc2ab54c8"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee919b676da28f78f91b464fb3e12238bd7474483352a59c8a16c39dfc59f0c5"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb1c0e682138f9067a58fc3c9a9bf1c83d8e08cfbee380d858e63196466d5c86"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00d8db270afb76f48a499f7bb8fa70297e66da67288471ca873db88382850bf4"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:9b0c2b466b2f4d89ccc33784c4ebb1627989bd84a39b79092e560e937a11d4ac"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:51d1b42d44f4ffb93188f9b39e6d1c82aa758fdb8d9de65e1ddfe7a7d250d7ad"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:11abdbfc6f7f7dea4a524b5f4117369b0d757725798f1593796be6ece20266cb"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:f02f4a72cc3ab2565c6d9720f0343cb840fb2dc01a2e9ecb8bc58ccf95dc5c06"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-win32.whl", hash = "sha256:81d5dd2dd9ab78d31a451e357315f201d976c131ca7d43870a0e8063b6b7a1ec"}, + {file = "psycopg2_binary-2.9.7-cp37-cp37m-win_amd64.whl", hash = "sha256:62cb6de84d7767164a87ca97e22e5e0a134856ebcb08f21b621c6125baf61f16"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:59f7e9109a59dfa31efa022e94a244736ae401526682de504e87bd11ce870c22"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:95a7a747bdc3b010bb6a980f053233e7610276d55f3ca506afff4ad7749ab58a"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c721ee464e45ecf609ff8c0a555018764974114f671815a0a7152aedb9f3343"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4f37bbc6588d402980ffbd1f3338c871368fb4b1cfa091debe13c68bb3852b3"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac83ab05e25354dad798401babaa6daa9577462136ba215694865394840e31f8"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:024eaeb2a08c9a65cd5f94b31ace1ee3bb3f978cd4d079406aef85169ba01f08"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1c31c2606ac500dbd26381145684d87730a2fac9a62ebcfbaa2b119f8d6c19f4"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-musllinux_1_1_i686.whl", hash = 
"sha256:42a62ef0e5abb55bf6ffb050eb2b0fcd767261fa3faf943a4267539168807522"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7952807f95c8eba6a8ccb14e00bf170bb700cafcec3924d565235dffc7dc4ae8"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e02bc4f2966475a7393bd0f098e1165d470d3fa816264054359ed4f10f6914ea"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-win32.whl", hash = "sha256:fdca0511458d26cf39b827a663d7d87db6f32b93efc22442a742035728603d5f"}, + {file = "psycopg2_binary-2.9.7-cp38-cp38-win_amd64.whl", hash = "sha256:d0b16e5bb0ab78583f0ed7ab16378a0f8a89a27256bb5560402749dbe8a164d7"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6822c9c63308d650db201ba22fe6648bd6786ca6d14fdaf273b17e15608d0852"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f94cb12150d57ea433e3e02aabd072205648e86f1d5a0a692d60242f7809b15"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5ee89587696d808c9a00876065d725d4ae606f5f7853b961cdbc348b0f7c9a1"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad5ec10b53cbb57e9a2e77b67e4e4368df56b54d6b00cc86398578f1c635f329"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:642df77484b2dcaf87d4237792246d8068653f9e0f5c025e2c692fc56b0dda70"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a8b575ac45af1eaccbbcdcf710ab984fd50af048fe130672377f78aaff6fc1"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f955aa50d7d5220fcb6e38f69ea126eafecd812d96aeed5d5f3597f33fad43bb"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ad26d4eeaa0d722b25814cce97335ecf1b707630258f14ac4d2ed3d1d8415265"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ced63c054bdaf0298f62681d5dcae3afe60cbae332390bfb1acf0e23dcd25fc8"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2b04da24cbde33292ad34a40db9832a80ad12de26486ffeda883413c9e1b1d5e"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-win32.whl", hash = "sha256:18f12632ab516c47c1ac4841a78fddea6508a8284c7cf0f292cb1a523f2e2379"}, + {file = "psycopg2_binary-2.9.7-cp39-cp39-win_amd64.whl", hash = "sha256:eb3b8d55924a6058a26db69fb1d3e7e32695ff8b491835ba9f479537e14dcf9f"}, ] psycopg2cffi = [ {file = "psycopg2cffi-2.9.0.tar.gz", hash = "sha256:7e272edcd837de3a1d12b62185eb85c45a19feda9e62fa1b120c54f9e8d35c52"}, @@ -4479,8 +4477,8 @@ pyflakes = [ {file = "pyflakes-3.1.0.tar.gz", hash = "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"}, ] pygments = [ - {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, - {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, + {file = "Pygments-2.16.1-py3-none-any.whl", hash = "sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692"}, + {file = "Pygments-2.16.1.tar.gz", hash = "sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29"}, ] pyjwt = [ {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, @@ -5140,8 +5138,8 @@ websockets = [ {file = 
"websockets-11.0.3.tar.gz", hash = "sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016"}, ] wheel = [ - {file = "wheel-0.41.0-py3-none-any.whl", hash = "sha256:7e9be3bbd0078f6147d82ed9ed957e323e7708f57e134743d2edef3a7b7972a9"}, - {file = "wheel-0.41.0.tar.gz", hash = "sha256:55a0f0a5a84869bce5ba775abfd9c462e3a6b1b7b7ec69d72c0b83d673a5114d"}, + {file = "wheel-0.41.1-py3-none-any.whl", hash = "sha256:473219bd4cbedc62cea0cb309089b593e47c15c4a2531015f94e4e3b9a0f6981"}, + {file = "wheel-0.41.1.tar.gz", hash = "sha256:12b911f083e876e10c595779709f8a88a59f45aacc646492a67fe9ef796c1b47"}, ] wrapt = [ {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"}, diff --git a/pyproject.toml b/pyproject.toml index add473cfe..a21cb3122 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ packages = [{include = "sources"}] [tool.poetry.dependencies] python = "^3.8.1" black = "^23.3.0" -dlt = {version = "^0.3.5a", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb"]} +dlt = {version = "^0.3.8", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb"]} [tool.poetry.group.dev.dependencies] mypy = "^0.991" diff --git a/sources/google_sheets/README.md b/sources/google_sheets/README.md index 8e8314a1d..9e2bc226f 100644 --- a/sources/google_sheets/README.md +++ b/sources/google_sheets/README.md @@ -1,62 +1,92 @@ # Google Sheets -This verified source can be used to load data from a [Google Sheets](https://www.google.com/sheets/about/) workspace onto a destination of your choice. +## Prepare your data -| Endpoints | Description | -| --- | --- | -| Tables | tables of the spreadsheet, tables have same name as individual sheets | -| Named ranges | loaded as a separate column with an automatically generated header | -| Merged cells | retains only the cell value that was taken during the merge (e.g., top-leftmost), and every other cell in the merge is given a null value | +We recommend to to use [Named Ranges](link to gsheets) to indicate which data should be extracted from a particular spreadsheet and this is how this source +will work by default - when called with without setting any other options. All the named ranges will be converted into tables named after them and stored in the +destination. +* You can let the spreadsheet users to add and remove tables by just adding/removing the ranges, you do not need to configure the pipeline again. +* You can indicate exactly the fragments of interest and only this data will be retrieved so it is the fastest. +* You can name database tables by changing the range names. +If you are not happy with the workflow above, you can: +* Disable it by setting `get_named_ranges` option to False +* Enable retrieving all sheets/tabs with `get_sheets` option set to True +* Pass a list of ranges as supported by Google Sheets in `range_names` -Initialize a `dlt` project with the following command: -```bash -dlt init google_sheets bigquery +### Make sure your data has headers and is a proper table +**First row of any extracted range should contain headers**. Please make sure: +1. The header names are strings and are unique. +2. That all the columns that you intend to extract have a header. +3. That data starts exactly at the origin of the range - otherwise source will remove padding but it is a waste of resources! 
+ +When the source detects any problems with headers or table layout **it will issue a WARNING in the log**, so it makes sense to run your pipeline script manually/locally first and fix all the problems. +1. Columns without headers will be removed and not extracted! +2. Columns with headers that do not contain any data will be removed. +3. If there are any problems with reading the headers (i.e. a header is not a string, is empty or not unique): **the header row will be extracted as data** and automatic header names will be used. +4. Empty rows are ignored. +5. `dlt` will normalize range names and headers into table and column names - so they may be different in the database than in Google Sheets. Prefer lowercase names without special characters! + +### Data Types +The `dlt` normalizer will use the first row of data to infer types and will try to coerce the following rows - creating variant columns if that is not possible. This is standard behavior. +**date time** and **date** types are also recognized; this happens via additional metadata that is retrieved for the first data row. + +## Passing the spreadsheet id/url and explicit range names +You can use either the url of your spreadsheet, which you can copy from the browser, e.g. +``` +https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing +``` +or the spreadsheet id (which is a part of the url) +``` +1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4 +``` +Typically you pass it directly to the `google_spreadsheet` function. + +**passing ranges** + +You can pass explicit ranges to `google_spreadsheet`: +1. sheet names +2. named ranges +3. any range in Google Sheets format, i.e. **sheet 1!A1:B7** + + +## The `spreadsheet_info` table +This table is repopulated after every load and keeps information on the loaded ranges: +* id of the spreadsheet +* name of the range as passed to the source +* string representation of the loaded range +* the range above in parsed representation + +## Running on Airflow (and some under the hood information) +Internally, the source loads all the data immediately in `google_spreadsheet`, before the pipeline is executed in `run`. No matter how many ranges you request, we make just two calls to the API to retrieve data. This works very well with typical scripts that create a dlt source with `google_spreadsheet` and then run it with `pipeline.run`. + +In the case of Airflow, the source is created and executed separately. In a typical configuration where the runner is a separate machine, **this will load data twice**. + +**Moreover, you should not use `scc` decomposition in our Airflow helper**. It will create an instance of the source for each requested range in order to run a task that corresponds to it!
Following our [Airflow deployment guide](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer#2-modify-dag-file), this is how you should use `tasks.add_run` on `PipelineTasksGroup`: +```python +@dag( + schedule_interval='@daily', + start_date=pendulum.datetime(2023, 2, 1), + catchup=False, + max_active_runs=1, + default_args=default_task_args +) +def get_named_ranges(): + tasks = PipelineTasksGroup("get_named_ranges", use_data_folder=False, wipe_local_data=True) + + # import your source from pipeline script + from google_sheets import google_spreadsheet + + pipeline = dlt.pipeline( + pipeline_name="get_named_ranges", + dataset_name="named_ranges_data", + destination='bigquery', + ) + + # do not use decompose to run `google_spreadsheet` in single task + tasks.add_run(pipeline, google_spreadsheet("1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580"), decompose="none", trigger_rule="all_done", retries=0, provide_context=True) ``` -Here, we chose BigQuery as the destination. To choose a different destination, replace `bigquery` with your choice of [destination.](https://dlthub.com/docs/dlt-ecosystem/destinations) - -## Grab Google Sheets credentials - -To read about grabbing the Google Sheets credentials and configuring the verified source, please refer to the [full documentation here.](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets#google-sheets-api-authentication) - -## Add credentials - -1. Open `.dlt/secrets.toml` -2. From the .json that you downloaded earlier, copy “project_id”, “private_key”, and “client_email” as follows: - - ```toml - [sources.google_spreadsheet.credentials] - project_id = "set me up" # GCP Source project ID! - private_key = "set me up" # Unique private key !(Must be copied fully including BEGIN and END PRIVATE KEY) - client_email = "set me up" # Email for source service account - location = "set me up" #Project Location For ex. “US” - - ``` - -3. Enter credentials for your chosen destination as per the [docs](https://dlthub.com/docs/dlt-ecosystem/destinations/). - -## Run the pipeline - -1. Install the requirements by using the following command: - - ```bash - pip install -r requirements.txt - ``` - -2. Run the pipeline by using the following command: - - ```bash - python3 google_sheets_pipelines.py - ``` - -3. Use the following command to make sure that everything loaded as expected: - - ```bash - dlt pipeline google_sheets_pipeline show - ``` - - - -💡 To explore additional customizations for this pipeline, we recommend referring to the official DLT Google Sheets documentation. It provides comprehensive information and guidance on how to further customize and tailor the pipeline to suit your specific needs. You can find the DLT Google Sheets documentation in [Setup Guide: Google Sheets](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets). +## Setup credentials +[We recommend to use service account for any production deployments](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets#google-sheets-api-authentication) diff --git a/sources/google_sheets/__init__.py b/sources/google_sheets/__init__.py index 8fed97d50..1f0d87f78 100644 --- a/sources/google_sheets/__init__.py +++ b/sources/google_sheets/__init__.py @@ -1,33 +1,31 @@ """Loads Google Sheets data from tabs, named and explicit ranges. 
Contains the main source functions.""" -from typing import Iterator, List, Sequence, Union, Iterable +from typing import Sequence, Union, Iterable import dlt from dlt.common import logger -from dlt.common.typing import DictStrAny, Dict, TDataItem, StrAny -from dlt.common.exceptions import MissingDependencyException from dlt.sources.credentials import GcpServiceAccountCredentials, GcpOAuthCredentials from dlt.extract.source import DltResource from .helpers.data_processing import ( - convert_named_range_to_a1, + ParsedRange, + get_data_types, + get_range_headers, get_spreadsheet_id, process_range, ) from .helpers.api_calls import api_auth from .helpers import api_calls -from apiclient.discovery import Resource - @dlt.source def google_spreadsheet( - spreadsheet_identifier: str = dlt.config.value, + spreadsheet_url_or_id: str = dlt.config.value, range_names: Sequence[str] = dlt.config.value, credentials: Union[ - GcpServiceAccountCredentials, GcpOAuthCredentials + GcpOAuthCredentials, GcpServiceAccountCredentials ] = dlt.secrets.value, - get_sheets: bool = True, + get_sheets: bool = False, get_named_ranges: bool = True, ) -> Iterable[DltResource]: """ @@ -36,13 +34,13 @@ def google_spreadsheet( - Optionally, dlt resources for all sheets inside the spreadsheet and all named ranges inside the spreadsheet. Args: - spreadsheet_identifier (str): The ID or URL of the spreadsheet. - range_names (Sequence[str]): A list of ranges in the spreadsheet of the format "sheet_name!range_name". + spreadsheet_url_or_id (str): The ID or URL of the spreadsheet. + range_names (Sequence[str]): A list of ranges in the spreadsheet in the format used by Google Sheets. Accepts Named Ranges and Sheets (tabs) names. These are the ranges to be converted into tables. credentials (Union[GcpServiceAccountCredentials, GcpOAuthCredentials]): GCP credentials to the account with Google Sheets API access, defined in dlt.secrets. get_sheets (bool, optional): If True, load all the sheets inside the spreadsheet into the database. - Defaults to True. + Defaults to False. get_named_ranges (bool, optional): If True, load all the named ranges inside the spreadsheet into the database. Defaults to True. @@ -52,135 +50,107 @@ def google_spreadsheet( # authenticate to the service using the helper function service = api_auth(credentials) # get spreadsheet id from url or id - spreadsheet_id = get_spreadsheet_id(spreadsheet_identifier) - # Initialize a list with the values in range_names (can be an array if declared in config.toml). This needs to be converted to a list because it will be used as input by google-api-python-client - # type needs to be checked instead of isinstance since toml Arrays have lists as a superclass - if type(range_names) is not list: - ranges_list = [range_name for range_name in range_names] - else: - ranges_list = range_names - # if sheet names or named_ranges are to be added as tables, an extra api call is made. 
- named_ranges = None - if get_sheets or get_named_ranges: - # get metadata and append to list of ranges as needed - simple_metadata = api_calls.get_metadata_simple( + spreadsheet_id = get_spreadsheet_id(spreadsheet_url_or_id) + all_range_names = set(range_names or []) + # if no explicit ranges, get sheets and named ranges from metadata + if not range_names: + # get metadata with list of sheets and named ranges in the spreadsheet + sheet_names, named_ranges = api_calls.get_known_range_names( spreadsheet_id=spreadsheet_id, service=service ) if get_sheets: - ranges_list += list(simple_metadata["sheets"].values()) + all_range_names.update(sheet_names) if get_named_ranges: - named_ranges = { - convert_named_range_to_a1( - named_range_dict=named_range, - sheet_names_dict=simple_metadata["sheets"], - ): named_range["name"] - for named_range in simple_metadata["named_ranges"] - } - ranges_list += list(named_ranges.keys()) - # get data and metadata - metadata_ranges_all = api_calls.get_metadata( - spreadsheet_id=spreadsheet_id, - service=service, - ranges=ranges_list, - named_ranges=named_ranges, - ) + all_range_names.update(named_ranges) - # create a list of dlt resources from the data and metadata - yield from get_data( + # first we get all data for all the ranges (explicit or named) + all_range_data = api_calls.get_data_for_ranges( service=service, spreadsheet_id=spreadsheet_id, - range_names=ranges_list, - metadata_dict=metadata_ranges_all, + range_names=list(all_range_names), ) - - # create metadata resource - yield metadata_table( - spreadsheet_info=metadata_ranges_all, spreadsheet_id=spreadsheet_id + assert len(all_range_names) == len( + all_range_data + ), "Google Sheets API must return values for all requested ranges" + + # get metadata for the first two rows of each range + # the first row should contain headers + # the second row contains data which we'll use to sample data types. + # Google Sheets returns datetime and date types as a Lotus Notes serial number, which is just a float, so we cannot infer the correct types from the data alone + + # warn and remove empty ranges + range_data = [] + metadata_table = [] + for name, parsed_range, meta_range, values in all_range_data: + # pass all ranges to spreadsheet info - including empty ones + metadata_table.append( + { + "spreadsheet_id": spreadsheet_id, + "range_name": name, + "range": str(parsed_range), + "range_parsed": parsed_range._asdict(), + "skipped": True, + } + ) + if values is None or len(values) == 0: + logger.warning(f"Range {name} does not contain any data. Skipping.") + continue + if len(values) == 1: + logger.warning(f"Range {name} contains only 1 row of data. Skipping.") + continue + if len(values[0]) == 0: + logger.warning( + f"First row of range {name} does not contain data. Skipping." + ) + continue + metadata_table[-1]["skipped"] = False + range_data.append((name, parsed_range, meta_range, values)) + + meta_values = ( + service.spreadsheets() + .get( + spreadsheetId=spreadsheet_id, + ranges=[str(data[2]) for data in range_data], + includeGridData=True, + ) + .execute() ) + for name, parsed_range, _, values in range_data: + logger.info(f"Processing range {parsed_range} with name {name}") + # here is a tricky part due to how Google Sheets API returns the metadata.
We are not able to directly pair the input range names with returned metadata objects + # instead metadata objects are grouped by sheet names, still each group order preserves the order of input ranges + # so for each range we get a sheet name, we look for the metadata group for that sheet and then we consume first object on that list with pop + metadata = next( + sheet + for sheet in meta_values["sheets"] + if sheet["properties"]["title"] == parsed_range.sheet_name + )["data"].pop(0) + + headers_metadata = metadata["rowData"][0]["values"] + headers = get_range_headers(headers_metadata, name) + if headers is None: + # generate automatic headers and treat the first row as data + headers = [f"col_{idx+1}" for idx in range(len(headers_metadata))] + data_row_metadata = headers_metadata + rows_data = values[0:] + logger.warning( + f"Using automatic headers. WARNING: first row of the range {name} will be used as data!" + ) + else: + # first row contains headers and is skipped + data_row_metadata = metadata["rowData"][1]["values"] + rows_data = values[1:] + + data_types = get_data_types(data_row_metadata) - -@dlt.resource(write_disposition="replace", name="spreadsheet_info") -def metadata_table( - spreadsheet_info: StrAny, spreadsheet_id: str -) -> Iterator[TDataItem]: - """ - Creates the metadata_table resource. It adds a table with all loaded ranges into a table. - - Args: - spreadsheet_info (StrAny): This is a dict where all loaded ranges are keys. - Inside the dict there is another dict with keys: "headers", "sheet_name", "index" and "values". - spreadsheet_id (str): The ID of the spreadsheet is included for extra info. - - Yields: - Iterator[TDataItem]: Generator of dicts with info on ranges that were loaded into the database. - """ - - # get keys for metadata dict and iterate through them - # the keys for this dict are the ranges where the data is gathered from - loaded_ranges = spreadsheet_info.keys() - for loaded_range in loaded_ranges: - # get needed info from dict - loaded_range_meta = spreadsheet_info[loaded_range] - range_num_headers = len(loaded_range_meta["headers"]) - range_sheet_name = loaded_range_meta["range"].split("!")[0] - # table structure - table_dict = { - "spreadsheet_id": spreadsheet_id, - "loaded_range": loaded_range, - "sheet_name": range_sheet_name, - "num_cols": range_num_headers, - } - # change name of loaded range name if it is a ranged name - if loaded_range_meta["name"]: - table_dict["loaded_range"] = loaded_range_meta["name"] - yield table_dict - - -def get_data( - service: Resource, - spreadsheet_id: str, - range_names: List[str], - metadata_dict: Dict[str, DictStrAny], -) -> Iterable[DltResource]: - """ - Makes an API call to Google Sheets and retrieves all the ranges listed in range_names. - Processes them into dlt resources. - - Args: - service (Resource): Object to make API calls to Google Sheets. - spreadsheet_id (str): The ID of the spreadsheet. - range_names (List[str]): List of range names. - metadata_dict (Dict[str, DictStrAny]): The dictionary with metadata. - - Yields: - Iterable[DltResource]: List of dlt resources, each containing a table of a specific range. - """ - - # get data from Google Sheets and iterate through them to process each range into a separate dlt resource - values = api_calls.get_data_batch( - service=service, spreadsheet_id=spreadsheet_id, range_names=range_names - ) - for value in values: - # get range name and metadata for sheet. Extra quotation marks returned by the API call are removed. 
- range_part1, range_part2 = value["range"].split("!") - range_part1 = range_part1.strip("'") - sheet_range_name = f"{range_part1}!{range_part2}" - named_range_name = None - try: - sheet_meta_batch = metadata_dict[sheet_range_name] - named_range_name = sheet_meta_batch["name"] - except KeyError: - try: - sheet_range_name = sheet_range_name.split("!")[0] - sheet_meta_batch = metadata_dict[sheet_range_name] - except KeyError: - logger.warning(f"Skipping data for empty range: {sheet_range_name}") - continue - # get range values - sheet_range = value["values"] - # create a resource from processing both sheet/range values yield dlt.resource( - process_range(sheet_val=sheet_range, sheet_meta=sheet_meta_batch), - name=named_range_name or sheet_range_name, + process_range(rows_data, headers=headers, data_types=data_types), + name=name, write_disposition="replace", ) + yield dlt.resource( + metadata_table, + write_disposition="merge", + name="spreadsheet_info", + merge_key="spreadsheet_id", + ) diff --git a/sources/google_sheets/docs_images/Add_people.png b/sources/google_sheets/docs_images/Add_people.png deleted file mode 100644 index 048819f43..000000000 Binary files a/sources/google_sheets/docs_images/Add_people.png and /dev/null differ diff --git a/sources/google_sheets/docs_images/Share_button.png b/sources/google_sheets/docs_images/Share_button.png deleted file mode 100644 index 2ac37ef1b..000000000 Binary files a/sources/google_sheets/docs_images/Share_button.png and /dev/null differ diff --git a/sources/google_sheets/helpers/api_calls.py b/sources/google_sheets/helpers/api_calls.py index 6ae5a9813..92a7c9370 100644 --- a/sources/google_sheets/helpers/api_calls.py +++ b/sources/google_sheets/helpers/api_calls.py @@ -1,14 +1,13 @@ -"""Contains helper functions to make API calls""" +"""Contains helper functions to extract data from spreadsheet API""" -from typing import List +from typing import Any, List, Tuple -from dlt.common import logger from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import DictStrAny from dlt.sources.credentials import GcpCredentials, GcpOAuthCredentials -from .data_processing import get_first_line, get_range_headers, metadata_preprocessing +from .data_processing import ParsedRange, trim_range_top_left try: from apiclient.discovery import build, Resource @@ -33,118 +32,30 @@ def api_auth(credentials: GcpCredentials) -> Resource: return service -def get_metadata_simple(spreadsheet_id: str, service: Resource) -> DictStrAny: +def get_known_range_names( + spreadsheet_id: str, service: Resource +) -> Tuple[List[str], List[str]]: """ - Makes a simple get metadata API call which just returns information about the spreadsheet such as sheet names and named ranges. + Retrieves spreadsheet metadata and extracts a list of sheet names and named ranges Args: spreadsheet_id (str): The ID of the spreadsheet. service (Resource): Resource object used to make API calls to Google Sheets API. Returns: - DictStrAny: A dictionary containing information on sheets and named ranges. It has two keys: "sheets" and "named_ranges". 
+ Tuple[List[str], List[str]] """ - return_info: DictStrAny = {"sheets": {}, "named_ranges": []} - # get metadata of spreadsheet to check for number of sheets inside metadata = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute() + sheet_names: List[str] = [s["properties"]["title"] for s in metadata["sheets"]] + named_ranges: List[str] = [r["name"] for r in metadata.get("namedRanges", {})] + return sheet_names, named_ranges - # metadata["sheets"] is a list containing dicts with info on sheets inside the spreadsheet - # iterate through the sheets in the metadata and get their names - for sheet_m in metadata["sheets"]: - # get name and append to list of sheet names - sheet_name = sheet_m["properties"]["title"] - sheet_id = sheet_m["properties"]["sheetId"] - return_info["sheets"][sheet_id] = sheet_name - logger.info(f"Found the following sheets {return_info['sheets']}") - - # this is a list containing dicts with info on named ranges - if "namedRanges" in metadata: - return_info["named_ranges"] = metadata["namedRanges"] - logger.info(f"Found the following sheets {return_info['named_ranges']}") - return return_info - - -def get_metadata( - spreadsheet_id: str, - service: Resource, - ranges: List[str], - named_ranges: DictStrAny = None, -) -> DictStrAny: - """ - TODO: add fields to save on info returned - Gets the metadata for the first 2 rows of every range specified. The first row is deduced as the header and the 2nd row specifies the format the rest of the data should follow. - - Args: - spreadsheet_id (str): The ID of the spreadsheet. - service (Resource): Resource object used by google-api-python-client to make API calls. - ranges (List[str]): List of ranges to get data from. If left empty, every sheet inside the spreadsheet will be included instead. Named ranges not supported. - named_ranges (DictStrAny, optional): Dict containing ranges as keys and the corresponding named ranges as the values. - - Returns: - DictStrAny: A dict where all the range names are the key. The values for each key are the corresponding sheet metadata: sheet_name, headers, values. - """ - # process metadata ranges so only the first 2 rows are appended - # response like dict will contain a dict similar to the response by the Google Sheets API: ranges are returned inside the sheets they belong in the order given in the API request. 
- meta_ranges, response_like_dict = metadata_preprocessing( - ranges=ranges, named_ranges=named_ranges - ) - spr_meta = ( - service.spreadsheets() - .get(spreadsheetId=spreadsheet_id, ranges=meta_ranges, includeGridData=True) - .execute() - ) - - # process and populate metadata in response like dict but return in from metadata_all_ranges because we need the data returned in a more organized format - metadata_all_ranges = {} - for sheet in spr_meta["sheets"]: - # get sheet name, so we can associate with dict and load the data into dict - meta_sheet_name = sheet["properties"]["title"] - sheet_data = sheet["data"] - - # skip record if not found in the response dict - if not (meta_sheet_name in response_like_dict): - continue - # get ranges inside the sheet in order - for i in range(len(sheet_data)): - metadata_range_name = response_like_dict[meta_sheet_name][i]["range"] - # check that sheet is not empty, otherwise delete - if not ("rowData" in sheet_data[i]): - logger.warning(f"Metadata - Skipped empty range: {metadata_range_name}") - continue - # get headers and 1st row data - range_metadata = sheet_data[i]["rowData"] - headers = get_range_headers( - range_metadata=range_metadata, range_name=metadata_range_name - ) - if not headers: - logger.warning( - f"Metadata: Skipped range with empty headers: {metadata_range_name}" - ) - continue - first_line_values = get_first_line(range_metadata=range_metadata) - if not first_line_values: - logger.warning( - f"Metadata: No data values for the first line of data {metadata_range_name}" - ) - # add headers and values - response_like_dict[meta_sheet_name][i]["headers"] = headers - response_like_dict[meta_sheet_name][i][ - "cols_is_datetime" - ] = first_line_values - # append dict to response - metadata_all_ranges[metadata_range_name] = response_like_dict[ - meta_sheet_name - ][i] - return metadata_all_ranges - - -def get_data_batch( +def get_data_for_ranges( service: Resource, spreadsheet_id: str, range_names: List[str] -) -> List[DictStrAny]: +) -> List[Tuple[str, ParsedRange, ParsedRange, List[List[Any]]]]: """ Calls Google Sheets API to get data in a batch. This is the most efficient way to get data for multiple ranges inside a spreadsheet. - However, this API call will return the data for each range without the same name that the range was called. Args: service (Resource): Object to make API calls to Google Sheets. @@ -152,17 +63,9 @@ def get_data_batch( range_names (List[str]): List of range names. Returns: - List[DictStrAny]: List of dictionaries, each dictionary will contain all data for one of the requested ranges. + List[DictStrAny]: A list of ranges with data in the same order as `range_names` """ - # handle requests with no ranges - edge case - if not range_names: - logger.warning( - "Fetching data error: No ranges to get data from. Check the input ranges are not empty." 
- ) - return [] - # Make an api call to get the data for all sheets and ranges - # get dates as serial number - values: List[DictStrAny] = ( + range_batch_resp = ( service.spreadsheets() .values() .batchGet( @@ -173,7 +76,19 @@ def get_data_batch( # will return formatted dates as a serial number dateTimeRenderOption="SERIAL_NUMBER", ) - .execute()["valueRanges"] + .execute() ) - logger.info("Data fetched") - return values + # if there are no ranges to be loaded, there's no "valueRanges" + range_batch: List[DictStrAny] = range_batch_resp.get("valueRanges", []) + # trim the empty top rows and columns from the left + rv = [] + for name, range_ in zip(range_names, range_batch): + parsed_range = ParsedRange.parse_range(range_["range"]) + values: List[List[Any]] = range_.get("values", None) + if values: + parsed_range, values = trim_range_top_left(parsed_range, values) + # create a new range to get the first two rows + meta_range = parsed_range._replace(end_row=parsed_range.start_row + 1) + # print(f"{name}:{parsed_range}:{meta_range}") + rv.append((name, parsed_range, meta_range, values)) + return rv diff --git a/sources/google_sheets/helpers/data_processing.py b/sources/google_sheets/helpers/data_processing.py index 6e0c4ebbf..ca05f85bb 100644 --- a/sources/google_sheets/helpers/data_processing.py +++ b/sources/google_sheets/helpers/data_processing.py @@ -1,10 +1,12 @@ """This is a helper module that contains function which validate and process data""" +import re +from typing import Any, Iterator, List, Tuple, Union, NamedTuple -from typing import Any, Dict, Iterator, List, Tuple, Union -from re import match +import dlt from dlt.common import logger, pendulum from dlt.common.typing import DictStrAny +from dlt.common.data_types import TDataType # this string comes before the id URL_ID_IDENTIFIER = "d" @@ -14,6 +16,64 @@ DLT_TIMEZONE = "UTC" # number of seconds from UNIX timestamp origin (1st Jan 1970) to serial number origin (30th Dec 1899) TIMESTAMP_CONST = -2209161600.0 +# compiled regex to extract ranges +RE_PARSE_RANGE = re.compile( + r"^(?:(?P<sheet>[\'\w\s]+)!)?(?P<start_col>[A-Z]+)(?P<start_row>\d+):(?P<end_col>[A-Z]+)(?P<end_row>\d+)$" +) + + +class ParsedRange(NamedTuple): + sheet_name: str + start_col: str + start_row: int + end_col: str + end_row: int + + @classmethod + def parse_range(cls, s: str) -> "ParsedRange": + match = RE_PARSE_RANGE.match(s) + if match: + parsed_dict = match.groupdict() + return ParsedRange( + parsed_dict["sheet"].strip("'"), + parsed_dict["start_col"], + int(parsed_dict["start_row"]), + parsed_dict["end_col"], + int(parsed_dict["end_row"]), + ) + else: + raise ValueError(s) + + def __str__(self) -> str: + return f"{self.sheet_name}!{self.start_col}{self.start_row}:{self.end_col}{self.end_row}" + + @staticmethod + def shift_column(col: str, shift: int) -> str: + """ + Shift a Google Sheets column string by a given number of positions. + + Parameters: + col (str): The original column string. + shift (int): The number of positions to shift the column. + + Returns: + str: The new column string after shifting.
+ """ + # Convert column string to column index (1-indexed) + col_num = 0 + for i, char in enumerate(reversed(col)): + col_num += (ord(char.upper()) - 65 + 1) * (26**i) + + # Shift the column index + col_num += shift + + # Convert back to column string + col_str = "" + while col_num > 0: + col_num, remainder = divmod(col_num - 1, 26) + col_str = chr(65 + remainder) + col_str + + return col_str def get_spreadsheet_id(url_or_id: str) -> str: @@ -28,16 +88,16 @@ def get_spreadsheet_id(url_or_id: str) -> str: """ # check if this is an url: http or https in it - if match(r"http://|https://", url_or_id): + if re.match(r"http://|https://", url_or_id): # process url - spreadsheet_id = process_url(url_or_id) + spreadsheet_id = extract_spreadsheet_id_from_url(url_or_id) return spreadsheet_id else: # just return id return url_or_id -def process_url(url: str) -> str: +def extract_spreadsheet_id_from_url(url: str) -> str: """ Takes a URL to a Google spreadsheet and computes the spreadsheet ID from it according to the spreadsheet URL formula: https://docs.google.com/spreadsheets/d//edit. If the URL is not formatted correctly, a ValueError will be raised. @@ -56,354 +116,182 @@ def process_url(url: str) -> str: parts = url.split("/") # loop through parts for i in range(len(parts)): - # if we find if parts[i] == URL_ID_IDENTIFIER and i + 1 < len(parts): # if the id part is left empty then the url is not formatted correctly if parts[i + 1] == "": - raise ValueError("Spreadsheet ID is an empty string") + raise ValueError(f"Spreadsheet ID is an empty string in url: {url}") else: return parts[i + 1] - # if url cannot be found, raise error - raise ValueError("Invalid URL. Cannot find spreadsheet ID") - - -def get_first_rows(sheet_range: str) -> List[str]: - """ - Receives the range of a Google sheet, parses it and outputs the sheet name, a range which includes the first 2 rows only. - Is used for only getting the first 2 rows when collecting metadata. + raise ValueError(f"Invalid URL. Cannot find spreadsheet ID in url: {url}") - Args: - sheet_range (str): Range of a Google sheet. Example: sheet1, sheet3!G18:O28. General formula {sheet_name}![Starting_column][Starting_row]:[Ending_column]:[Ending_row] - Returns: - List[str]: List containing the sheet name and the modified range to only have the first 2 rows. - """ - - # split on the ! - sheet_parts = sheet_range.split("!") - sheet_name = sheet_parts[0] - # the range can either have 1 or 2 parts: 1 part if it is simply a sheet name or 2 parts if it is an A1 range - if len(sheet_parts) == 1: - return [sheet_name, f"{sheet_name}!1:2"] - elif len(sheet_parts) > 2: - raise ValueError( - "Range format is incorrect! Check documentation for correct usage." - ) - - range_name = sheet_parts[1] - # split on the :, expecting strings in the form start:end, i.e 2 parts after splitting on the : - # separate row and column letters from both the range start and end - range_parts = range_name.split(":") - if len(range_parts) != 2: - raise ValueError( - "Range format is incorrect! Check documentation for correct usage." 
- ) - starting_row, starting_col = _separate_row_col(range_parts[0]) - ending_row, ending_col = _separate_row_col(range_parts[1]) - - # handle possible edge cases/errors - # start_col:end_col format - if not starting_row: - starting_row = "1" - # handle parsing errors and determine new end row - try: - ending_row = str(int(starting_row) + 1) - except ValueError: - raise ValueError(f"Crashed while reading range part: {range_parts[0]}") - return [ - sheet_name, - f"{sheet_name}!{starting_col}{starting_row}:{ending_col}{ending_row}", - ] - - -def _separate_row_col(row_col_str: str) -> Tuple[str, str]: - """ - Helper function that receives a row and column together from the A1 range and returns the row and column separately. - - Args: - row_col_str (str): Row and column together from the A1 range. Example: "A1", "BB2", "ZZ25", etc. - - Returns: - Tuple[str, str]: Row and column separately. Example: ("A", "1"), etc. - """ - range_row = "" - range_col = "" - for range_char in row_col_str: - if range_char.isdigit(): - range_row += range_char - else: - range_col += range_char - return range_row, range_col - - -def convert_named_range_to_a1( - named_range_dict: DictStrAny, sheet_names_dict: Dict[str, DictStrAny] = None -) -> str: - """ - Converts a named_range dict returned from Google Sheets API metadata call to an A1 range. - - Args: - named_range_dict (DictStrAny): Dict returned from Google Sheets API, containing information about a named range. - sheet_names_dict (Dict[str, DictStrAny], optional): Dict containing all the sheets inside the spreadsheet where the sheet id is the key and the sheet name is the corresponding value. - - Returns: - str: A string representing the named range as an A1 range. - """ - if not sheet_names_dict: - sheet_names_dict = {} - start_row_idx = named_range_dict["range"]["startRowIndex"] - end_row_idx = named_range_dict["range"]["endRowIndex"] - start_col_idx = named_range_dict["range"]["startColumnIndex"] - end_col_idx = named_range_dict["range"]["endColumnIndex"] - - # get sheet name from sheet_names_dict - sheet_id = named_range_dict["range"]["sheetId"] - named_range_sheet = sheet_names_dict[sheet_id] - - # convert columns from index to letters - start_col_letter = _convert_col_a1(start_col_idx) - end_col_letter = _convert_col_a1(end_col_idx - 1) - - # For some reason the end row index is 1 row beyond the actual stopping point, - # meaning we don't have to add 1 to convert to row number - return f"{named_range_sheet}!{start_col_letter}{start_row_idx+1}:{end_col_letter}{end_row_idx}" - - -def _convert_col_a1(col_idx: int) -> str: - """ - Helper, converts a column index to a column letter in accordance with Google Sheets - @:param: col_idx - index of column - @:return: col_name - name of a column - """ - letters = [ - "", - "A", - "B", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "J", - "K", - "L", - "M", - "N", - "O", - "P", - "Q", - "R", - "S", - "T", - "U", - "V", - "W", - "X", - "Y", - "Z", - ] - col_name = "" - while col_idx > 0: - col_idx, remainder = divmod(col_idx, 26) - if col_name: - # edge case - columns of 2 or more letters that start with the letter Z - if remainder == 0: - remainder = 26 - col_idx = col_idx - 1 - col_name = letters[remainder] + col_name - else: - col_name = letters[remainder + 1] + col_name - return col_name or "A" - - -def get_range_headers(range_metadata: List[DictStrAny], range_name: str) -> List[str]: +def get_range_headers(headers_metadata: List[DictStrAny], range_name: str) -> List[str]: """ Retrieves the headers for columns 
from the metadata of a range. Args: - range_metadata (List[DictStrAny]): Metadata for the first 2 rows of a range. + headers_metadata (List[DictStrAny]): Metadata for the first 2 rows of a range. range_name (str): The name of the range as appears in the metadata. Returns: List[str]: A list of headers. """ headers = [] - empty_header_index = 0 - for header in range_metadata[0]["values"]: + for idx, header in enumerate(headers_metadata): + header_val: str = None if header: - header_val = header["formattedValue"] - # warn user when reading non string values as header - metadata is the only place to check this info - if not ("stringValue" in header["effectiveValue"]): + if "stringValue" in header["effectiveValue"]: + header_val = header["formattedValue"] + else: logger.warning( - f"In range {range_name}, header value: {header_val} is not a string! Name changed when loaded in database!" + f"In range {range_name}, header value: {header['formattedValue']} is not a string!" ) - headers.append(header_val) + return None else: - headers.append(f"empty_header_filler{empty_header_index}") - empty_header_index = empty_header_index + 1 - # report if a header was empty - if empty_header_index: + logger.warning( + f"In range {range_name}, header at position {idx+1} is missing!" + ) + return None + headers.append(header_val) + + # make sure that headers are unique, first normalize the headers + header_mappings = { + h: dlt.current.source_schema().naming.normalize_identifier(h) for h in headers + } + if len(set(header_mappings.values())) != len(headers): logger.warning( - f"In range {range_name}, {empty_header_index} headers were found empty!" + "Header names must be unique, otherwise you risk that data in columns with duplicate header names will be lost. Note that several destinations require " + + "that column names are normalized, i.e. must be lower or upper case and without special characters. dlt normalizes those names for you but it may " + + f"result in duplicate column names. Headers in range {range_name} are mapped as follows: " + + ", ".join([f"{k}->{v}" for k, v in header_mappings.items()]) + + ". Please make your header names unique." ) - # manage headers being empty - if len(headers) == empty_header_index: - return [] + return None + return headers -def get_first_line(range_metadata: List[DictStrAny]) -> List[bool]: +def get_data_types(data_row_metadata: List[DictStrAny]) -> List[TDataType]: """ Determines if each column in the first line of a range contains datetime objects. Args: - range_metadata (List[DictStrAny]): Metadata for the first 2 rows in a range. + data_row_metadata (List[DictStrAny]): Metadata of the first row of data Returns: - List[bool]: A list of boolean values indicating whether each column in the first line contains datetime objects.
+        List[TDataType]: "timestamp" or "date" indicating the date/time type for a column, otherwise None """ # get data for 1st column and process them, if empty just return an empty list try: - is_datetime_cols = is_date_datatype(range_metadata[1]["values"]) + data_types: List[TDataType] = [None] * len(data_row_metadata) + for idx, val_dict in enumerate(data_row_metadata): + try: + data_type = val_dict["effectiveFormat"]["numberFormat"]["type"] + if data_type in ["DATE_TIME", "TIME"]: + data_types[idx] = "timestamp" + elif data_type == "DATE": + data_types[idx] = "date" + except KeyError: + pass + return data_types except IndexError: return [] - return is_datetime_cols - - -def is_date_datatype(value_list: List[DictStrAny]) -> List[bool]: - """ - Determines if each value in a list is a datetime object. - - Args: - value_list (List[DictStrAny]): A list of values from the first row of data returned by Google Sheets API. - - Returns: - List[bool]: A list of boolean values indicating whether each value is a datetime object. - """ - - value_type_list = [] - # loop through the list and process each value dict, decide if something is a datetime value or not - for val_dict in value_list: - try: - is_date_type = "DATE" in val_dict["effectiveFormat"]["numberFormat"]["type"] - is_time_type = "TIME" in val_dict["effectiveFormat"]["numberFormat"]["type"] - is_date = is_date_type or is_time_type - except KeyError: - is_date = False - value_type_list.append(is_date) - return value_type_list def serial_date_to_datetime( - serial_number: Union[int, float, str, bool] -) -> Union[pendulum.DateTime, str, bool]: + serial_number: Union[int, float], data_type: TDataType +) -> Union[pendulum.DateTime, pendulum.Date]: """ - Converts a serial number to a datetime object. + Converts a serial number to a datetime or, when data_type is "date", to a date. Args: - serial_number (Union[int, float, str, bool]): The serial number, which can be an int, float, bool, or str. + serial_number (Union[int, float]): The Lotus Notes serial number Returns: Union[pendulum.DateTime, str, bool]: The converted datetime object, or the original value if conversion fails. """ - - # if called with a different data type, return with whatever input was, handled by the dlt source later - edge case - if not isinstance(serial_number, (int, float)): - return serial_number # To get the seconds passed since the start date of serial numbers we round the product of the number of seconds in a day and the serial number conv_datetime: pendulum.DateTime = pendulum.from_timestamp( 0, DLT_TIMEZONE ) + pendulum.duration( seconds=TIMESTAMP_CONST + round(SECONDS_PER_DAY * serial_number) ) - return conv_datetime - - -def metadata_preprocessing( - ranges: List[str], named_ranges: DictStrAny = None -) -> Tuple[List[str], Dict[str, List[DictStrAny]]]: - """ - Helper function that iterates through the input ranges and processes them so that only the first 2 rows are returned per range. - It also structures all the ranges inside a dictionary similar to how they are returned by the Google Sheets API metadata request. - - Args: - ranges (List[str]): List of range names. - named_ranges (DictStrAny, optional): Dictionary containing ranges as keys and the corresponding named ranges as values. - - Returns: - Tuple[List[str], Dict[str, List[DictStrAny]]]: A tuple containing: - - meta_ranges: List containing all the ranges where metadata is gathered from. - - response_like_dict: A dictionary that mirrors the structure of the Google Sheets API response.
- The keys are the parent sheets, and the values are lists of dictionaries containing metadata for each range. - """ + # int values are dates, float values are datetimes + if data_type == "date": + return conv_datetime.date() # type: ignore - # process metadata ranges so only the first 2 rows are appended - # response like dict will contain a dict similar to the response by the Google Sheets API: ranges are returned inside the sheets they belong in the order given in the API request. - meta_ranges = [] - response_like_dict: Dict[str, List[DictStrAny]] = {} - for requested_range in ranges: - # Metadata ranges have a different range-only first 2 rows, so we need to convert to those ranges first - # convert range to first 2 rows, getting the sheet name and the range that has only the first 2 rows - range_info = get_first_rows(requested_range) - sheet_name = range_info[0] - # initialize the dict containing information about the metadata of this range, headers contains the names of all header columns and - # cols_is_date contains booleans indicating whether the data expected in that column is a datetime object or not. - unfilled_range_dict = { - "range": requested_range, - "headers": [], - "cols_is_datetime": [], - } - # try to fill information about range name if the range has a name by checking named ranges - try: - unfilled_range_dict["name"] = named_ranges[requested_range] - except (KeyError, TypeError): - unfilled_range_dict["name"] = None - # All the information in the dict is properly set up, now we just need to store it in the response_like_dict - try: - response_like_dict[sheet_name].append(unfilled_range_dict) - except KeyError: - response_like_dict[sheet_name] = [unfilled_range_dict] - meta_ranges.append(range_info[1]) - return meta_ranges, response_like_dict + return conv_datetime def process_range( - sheet_val: List[List[Any]], sheet_meta: DictStrAny + sheet_values: List[List[Any]], headers: List[str], data_types: List[TDataType] ) -> Iterator[DictStrAny]: """ - Receives 2 arrays of tabular data inside a sheet. This will be processed into a schema that is later stored into a database table. + Yields lists of values as dictionaries, converts date/time values and handles empty rows and cells. Please note: + 1. empty rows get ignored + 2. empty cells are converted to None (and then to NULL by dlt) + 3. data in columns without headers will be dropped Args: - sheet_val (List[List[Any]]): 2D array of values. - sheet_meta (DictStrAny): Metadata gathered for this specific range. + sheet_values (List[List[Any]]): range values without the header row + headers (List[str]): names of the headers + data_types (List[TDataType]): "timestamp", "date" or None for each column Yields: DictStrAny: A dictionary version of the table. It generates a dictionary of the type {header: value} for every row.
""" - # get headers and first line data types which is just Datetime or not Datetime so far and loop through the remaining values - headers = sheet_meta["headers"] - first_line_val_types = sheet_meta["cols_is_datetime"] - # edge case - only 1 line of data, load the empty tables - if len(sheet_val) == 1: - yield {header: "" for header in headers} - # otherwise loop through the other rows and return data normally - for row in sheet_val[1:]: - table_dict = {} + + for row in sheet_values: # empty row; skip if not row: continue + table_dict = {} # process both rows and check for differences to spot dates - for val, header, is_datetime in zip(row, headers, first_line_val_types): + for val, header, data_type in zip(row, headers, data_types): # 3 main cases: null cell value, datetime value, every other value # handle null values properly. Null cell values are returned as empty strings, this will cause dlt to create new columns and fill them with empty strings if val == "": fill_val = None - elif is_datetime: - fill_val = serial_date_to_datetime(val) + elif data_type in ["timestamp", "date"]: + # the datetimes are inferred from first row of data. if next rows have inconsistent data types - pass the values to dlt to deal with them + if not isinstance(val, (int, float)) or isinstance(val, bool): + fill_val = val + else: + fill_val = serial_date_to_datetime(val, data_type) else: fill_val = val table_dict[header] = fill_val yield table_dict + + +def trim_range_top_left( + parsed_range: ParsedRange, range_values: List[List[Any]] +) -> Tuple[ParsedRange, List[List[Any]]]: + # skip empty rows and then empty columns + # skip empty rows + shift_x = 0 + for row in range_values: + if row: + break + else: + shift_x += 1 + if shift_x > 0: + range_values = range_values[shift_x:] + # skip empty columns + shift_y = 0 + if len(range_values) > 0: + for col in range_values[0]: + if col == "": + shift_y += 1 + else: + break + if shift_y > 0: + # skip all columns + for idx, row in enumerate(range_values): + range_values[idx] = row[shift_y:] + parsed_range = parsed_range._replace( + start_row=parsed_range.start_row + shift_x, + start_col=ParsedRange.shift_column(parsed_range.start_col, shift_y), + ) + return parsed_range, range_values diff --git a/sources/google_sheets_pipeline.py b/sources/google_sheets_pipeline.py index e36ebab42..f98869173 100644 --- a/sources/google_sheets_pipeline.py +++ b/sources/google_sheets_pipeline.py @@ -4,63 +4,77 @@ def load_pipeline_with_ranges() -> None: """ - Does a full pipeline run. Will load all ranges in config.toml. The dlt config also contains the spreadsheet url or id that data will be loaded from. + Loads explicitly passed ranges """ pipeline = dlt.pipeline( pipeline_name="google_sheets_pipeline", - destination="postgres", + destination="duckdb", full_refresh=False, dataset_name="test", ) - data = google_spreadsheet(get_sheets=False, get_named_ranges=False) + data = google_spreadsheet( + "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", + range_names=["NamedRange1", "Sheet 1", "Sheet 1!A1:D4"], + get_sheets=False, + get_named_ranges=False, + ) info = pipeline.run(data) print(info) def load_pipeline_with_sheets() -> None: """ - Does a pipeline run. Will load all the sheets in the spreadsheet, but it will not load any of the named ranges in the spreadsheet. Will also load all the ranges given in config. - The dlt config also contains the spreadsheet url or id that data will be loaded from. + Does a pipeline run. 
Will load all the sheets in the spreadsheet, but it will not load any of the named ranges in the spreadsheet. """ pipeline = dlt.pipeline( pipeline_name="google_sheets_pipeline", - destination="postgres", + destination="duckdb", full_refresh=False, dataset_name="sample_google_sheet_data", ) - data = google_spreadsheet(get_sheets=True, get_named_ranges=False) + data = google_spreadsheet( + "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580", + get_sheets=True, + get_named_ranges=False, + ) info = pipeline.run(data) print(info) def load_pipeline_with_named_ranges() -> None: """ - Does a pipeline run. Will not load the sheets in the spreadsheet, but it will load all the named ranges in the spreadsheet. Will also load all the ranges given in config. - The dlt config also contains the spreadsheet url or id that data will be loaded from. + Does a pipeline run. Will not load the sheets in the spreadsheet, but it will load all the named ranges in the spreadsheet. """ pipeline = dlt.pipeline( pipeline_name="google_sheets_pipeline", - destination="postgres", + destination="duckdb", full_refresh=False, dataset_name="sample_google_sheet_data", ) - data = google_spreadsheet(get_sheets=False, get_named_ranges=True) + data = google_spreadsheet( + "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580", + get_sheets=False, + get_named_ranges=True, + ) info = pipeline.run(data) print(info) def load_pipeline_with_sheets_and_ranges() -> None: """ - Does a pipeline run. Will load all the sheets in the spreadsheet and all the named ranges in the spreadsheet. Will also load all the ranges given in config. - The dlt config also contains the spreadsheet url or id that data will be loaded from. + Does a pipeline run. Will load all the sheets in the spreadsheet and all the named ranges in the spreadsheet. """ pipeline = dlt.pipeline( pipeline_name="google_sheets_pipeline", - destination="postgres", - full_refresh=False, + destination="duckdb", + full_refresh=True, dataset_name="sample_google_sheet_data", ) - data = google_spreadsheet(get_sheets=True, get_named_ranges=True) + data = google_spreadsheet( + "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580", + get_sheets=True, + get_named_ranges=True, + ) info = pipeline.run(data) print(info) diff --git a/sources/shopify_dlt/__init__.py b/sources/shopify_dlt/__init__.py index 080375a75..7b0f151a2 100644 --- a/sources/shopify_dlt/__init__.py +++ b/sources/shopify_dlt/__init__.py @@ -5,7 +5,8 @@ import dlt from dlt.extract.source import DltResource -from dlt.common.typing import TDataItem +from dlt.common.typing import TDataItem, TAnyDateTime +from dlt.common.time import ensure_pendulum_datetime from dlt.common import pendulum from .settings import ( @@ -14,7 +15,6 @@ DEFAULT_ITEMS_PER_PAGE, ) from .helpers import ShopifyApi, TOrderStatus -from .date_helper import TAnyDateTime, ensure_pendulum_datetime @dlt.source(name="shopify") @@ -34,6 +34,7 @@ def shopify_source( `start_time` argument can be used on its own or together with `end_time`. When both are provided data is limited to items updated in that time range. The range is "half-open", meaning elements equal and newer than `start_time` and elements older than `end_time` are included. + All resources opt-in to use Airflow scheduler if run as Airflow task Args: private_app_password: The app password to the app on your shop. 
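For illustration only, not part of this patch: a minimal sketch of a resource that opts in to an external scheduler through the allow_external_schedulers flag added in the hunks below. It only mirrors the dlt.sources.incremental usage shown in this diff; the resource name and the yielded row are made up.

import dlt
from dlt.common import pendulum

@dlt.resource(primary_key="id", write_disposition="merge")
def orders_sketch(
    updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sources.incremental(
        "updated_at",
        initial_value=pendulum.datetime(2023, 1, 1),
        # when run as an Airflow task, the scheduler may supply the load window
        # (assumption based on the "opt-in to use Airflow scheduler" note above)
        allow_external_schedulers=True,
    ),
):
    # a real resource would page through the Shopify API here; this row is hypothetical
    yield {"id": 1, "updated_at": pendulum.now()}

Outside of an external scheduler, such a resource should fall back to regular incremental loading starting from initial_value.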
@@ -66,7 +67,10 @@ def products( updated_at: dlt.sources.incremental[ pendulum.DateTime ] = dlt.sources.incremental( - "updated_at", initial_value=start_date_obj, end_value=end_date_obj + "updated_at", + initial_value=start_date_obj, + end_value=end_date_obj, + allow_external_schedulers=True, ), created_at_min: pendulum.DateTime = created_at_min_obj, items_per_page: int = items_per_page, @@ -95,7 +99,10 @@ def orders( updated_at: dlt.sources.incremental[ pendulum.DateTime ] = dlt.sources.incremental( - "updated_at", initial_value=start_date_obj, end_value=end_date_obj + "updated_at", + initial_value=start_date_obj, + end_value=end_date_obj, + allow_external_schedulers=True, ), created_at_min: pendulum.DateTime = created_at_min_obj, items_per_page: int = items_per_page, @@ -126,7 +133,10 @@ def customers( updated_at: dlt.sources.incremental[ pendulum.DateTime ] = dlt.sources.incremental( - "updated_at", initial_value=start_date_obj, end_value=end_date_obj + "updated_at", + initial_value=start_date_obj, + end_value=end_date_obj, + allow_external_schedulers=True, ), created_at_min: pendulum.DateTime = created_at_min_obj, items_per_page: int = items_per_page, diff --git a/sources/shopify_dlt/date_helper.py b/sources/shopify_dlt/date_helper.py deleted file mode 100644 index ed0e85ba6..000000000 --- a/sources/shopify_dlt/date_helper.py +++ /dev/null @@ -1,35 +0,0 @@ -from typing import Union, Optional -from datetime import datetime, date # noqa: I251 - -from dlt.common import pendulum -from dlt.common.time import parse_iso_like_datetime - - -TAnyDateTime = Union[pendulum.DateTime, pendulum.Date, datetime, date, str] - - -def ensure_pendulum_datetime(value: TAnyDateTime) -> pendulum.DateTime: - """Coerce a date/time value to a `pendulum.DateTime` object. - - UTC is assumed if the value is not timezone aware. - - Args: - value: The value to coerce. Can be a pendulum.DateTime, pendulum.Date, datetime, date or iso date/time str. - - Returns: - A timezone aware pendulum.DateTime object. 
- """ - if isinstance(value, datetime): - # both py datetime and pendulum datetime are handled here - ret = pendulum.instance(value) - if ret.tz is None: - return ret.in_tz("UTC") - return ret - elif isinstance(value, date): - return pendulum.datetime(value.year, value.month, value.day) - elif isinstance(value, str): - result = parse_iso_like_datetime(value) - if not isinstance(result, datetime): # TODO: iso date parses to date object - return pendulum.datetime(result.year, result.month, result.day) - return result - raise TypeError(f"Cannot coerce {value} to a pendulum.DateTime object.") diff --git a/sources/shopify_dlt/helpers.py b/sources/shopify_dlt/helpers.py index 00c6106c5..4876bef57 100644 --- a/sources/shopify_dlt/helpers.py +++ b/sources/shopify_dlt/helpers.py @@ -1,12 +1,12 @@ """Shopify source helpers""" from urllib.parse import urljoin +from dlt.common.time import ensure_pendulum_datetime from dlt.sources.helpers import requests from dlt.common.typing import TDataItem, TDataItems, Dict from typing import Any, Iterable, Optional, Literal from .settings import DEFAULT_API_VERSION -from .date_helper import ensure_pendulum_datetime TOrderStatus = Literal["open", "closed", "cancelled", "any"] diff --git a/sources/shopify_dlt/requirements.txt b/sources/shopify_dlt/requirements.txt index 333a677b7..291a40188 100644 --- a/sources/shopify_dlt/requirements.txt +++ b/sources/shopify_dlt/requirements.txt @@ -1 +1 @@ -dlt>=0.3.5,<0.4.0 +dlt>=0.3.8,<0.4.0 diff --git a/sources/zendesk/__init__.py b/sources/zendesk/__init__.py index abbe38cad..612135802 100644 --- a/sources/zendesk/__init__.py +++ b/sources/zendesk/__init__.py @@ -7,20 +7,21 @@ import dlt from dlt.common import pendulum -from dlt.common.time import parse_iso_like_datetime -from dlt.common.typing import TDataItem, TDataItems +from dlt.common.time import ensure_pendulum_datetime +from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime from dlt.extract.source import DltResource from .helpers.api_helpers import process_ticket, process_ticket_field -from .helpers.talk_api import ZendeskAPIClient +from .helpers.talk_api import PaginationType, ZendeskAPIClient from .helpers.credentials import TZendeskCredentials, ZendeskCredentialsOAuth from .helpers import make_date_ranges from .settings import ( DEFAULT_START_DATE, CUSTOM_FIELDS_STATE_KEY, + SUPPORT_ENDPOINTS, TALK_ENDPOINTS, - INCREMENTAL_ENDPOINTS, + INCREMENTAL_TALK_ENDPOINTS, SUPPORT_EXTRA_ENDPOINTS, ) @@ -28,20 +29,21 @@ @dlt.source(max_table_nesting=2) def zendesk_talk( credentials: TZendeskCredentials = dlt.secrets.value, - start_time: Optional[pendulum.DateTime] = DEFAULT_START_DATE, - end_time: Optional[pendulum.DateTime] = None, + start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE, + end_date: Optional[TAnyDateTime] = None, ) -> Iterable[DltResource]: """ Retrieves data from Zendesk Talk for phone calls and voicemails. - `start_time` argument can be used on its own or together with `end_time`. When both are provided + `start_date` argument can be used on its own or together with `end_date`. When both are provided data is limited to items updated in that time range. - The range is "half-open", meaning elements equal and higher than `start_time` and elements lower than `end_time` are included. + The range is "half-open", meaning elements equal and higher than `start_date` and elements lower than `end_date` are included. + All resources opt-in to use Airflow scheduler if run as Airflow task Args: credentials: The credentials for authentication. 
Defaults to the value in the `dlt.secrets` object. - start_time: The start time of the range for which to load. Defaults to January 1st 2000. - end_time: The end time of the range for which to load data. + start_date: The start time of the range for which to load. Defaults to January 1st 2000. + end_date: The end time of the range for which to load data. If end time is not provided, the incremental loading will be enabled and after initial run, only new data will be retrieved Yields: DltResource: Data resources from Zendesk Talk. @@ -49,21 +51,24 @@ def zendesk_talk( # use the credentials to authenticate with the ZendeskClient zendesk_client = ZendeskAPIClient(credentials) + start_date_obj = ensure_pendulum_datetime(start_date) + end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None # regular endpoints - for key, talk_endpoint in TALK_ENDPOINTS.items(): + for key, talk_endpoint, item_name, cursor_paginated in TALK_ENDPOINTS: yield dlt.resource( talk_resource( - zendesk_client=zendesk_client, - talk_endpoint_name=key, - talk_endpoint=talk_endpoint, + zendesk_client, + key, + item_name or talk_endpoint, + PaginationType.CURSOR if cursor_paginated else PaginationType.OFFSET, ), name=key, write_disposition="replace", ) # adding incremental endpoints - for key, talk_incremental_endpoint in INCREMENTAL_ENDPOINTS.items(): + for key, talk_incremental_endpoint in INCREMENTAL_TALK_ENDPOINTS.items(): yield dlt.resource( talk_incremental_resource, name=f"{key}_incremental", @@ -73,16 +78,20 @@ def zendesk_talk( zendesk_client=zendesk_client, talk_endpoint_name=key, talk_endpoint=talk_incremental_endpoint, - updated_at=dlt.sources.incremental( + updated_at=dlt.sources.incremental[str]( "updated_at", - initial_value=start_time.isoformat(), - end_value=end_time.isoformat() if end_time else None, + initial_value=start_date_obj.isoformat(), + end_value=end_date_obj.isoformat() if end_date_obj else None, + allow_external_schedulers=True, ), ) def talk_resource( - zendesk_client: ZendeskAPIClient, talk_endpoint_name: str, talk_endpoint: str + zendesk_client: ZendeskAPIClient, + talk_endpoint_name: str, + talk_endpoint: str, + pagination_type: PaginationType, ) -> Iterator[TDataItem]: """ Loads data from a Zendesk Talk endpoint. @@ -91,13 +100,14 @@ def talk_resource( zendesk_client: An instance of ZendeskAPIClient for making API calls to Zendesk Talk. talk_endpoint_name: The name of the talk_endpoint. talk_endpoint: The actual URL ending of the endpoint. + pagination: Type of pagination type used by endpoint Yields: TDataItem: Dictionary containing the data from the endpoint. """ # send query and process it yield from zendesk_client.get_pages( - endpoint=talk_endpoint, data_point_name=talk_endpoint_name + talk_endpoint, talk_endpoint_name, pagination_type ) @@ -120,10 +130,13 @@ def talk_incremental_resource( TDataItem: Dictionary containing the data from the endpoint. 
""" # send the request and process it - for page in zendesk_client.get_pages_incremental( - endpoint=talk_endpoint, - data_point_name=talk_endpoint_name, - start_time=parse_iso_like_datetime(updated_at.last_value).int_timestamp, + for page in zendesk_client.get_pages( + talk_endpoint, + talk_endpoint_name, + PaginationType.START_TIME, + params={ + "start_time": ensure_pendulum_datetime(updated_at.last_value).int_timestamp + }, ): yield page if updated_at.end_out_of_range: @@ -133,20 +146,21 @@ def talk_incremental_resource( @dlt.source(max_table_nesting=2) def zendesk_chat( credentials: ZendeskCredentialsOAuth = dlt.secrets.value, - start_time: Optional[pendulum.DateTime] = DEFAULT_START_DATE, - end_time: Optional[pendulum.DateTime] = None, + start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE, + end_date: Optional[TAnyDateTime] = None, ) -> Iterable[DltResource]: """ Retrieves data from Zendesk Chat for chat interactions. - `start_time` argument can be used on its own or together with `end_time`. When both are provided + `start_date` argument can be used on its own or together with `end_date`. When both are provided data is limited to items updated in that time range. - The range is "half-open", meaning elements equal and higher than `start_time` and elements lower than `end_time` are included. + The range is "half-open", meaning elements equal and higher than `start_date` and elements lower than `end_date` are included. + All resources opt-in to use Airflow scheduler if run as Airflow task Args: credentials: The credentials for authentication. Defaults to the value in the `dlt.secrets` object. - start_time: The start time of the range for which to load. Defaults to January 1st 2000. - end_time: The end time of the range for which to load data. + start_date: The start time of the range for which to load. Defaults to January 1st 2000. + end_date: The end time of the range for which to load data. If end time is not provided, the incremental loading will be enabled and after initial run, only new data will be retrieved Yields: @@ -155,18 +169,23 @@ def zendesk_chat( # Authenticate zendesk_client = ZendeskAPIClient(credentials, url_prefix="https://www.zopim.com") - yield dlt.resource(chats_table_resource, name="chats", write_disposition="append")( + start_date_obj = ensure_pendulum_datetime(start_date) + end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None + + yield dlt.resource(chats_table_resource, name="chats", write_disposition="merge")( zendesk_client, - dlt.sources.incremental( + dlt.sources.incremental[str]( "update_timestamp|updated_timestamp", - initial_value=start_time.isoformat(), - end_value=end_time.isoformat() if end_time else None, + initial_value=start_date_obj.isoformat(), + end_value=end_date_obj.isoformat() if end_date_obj else None, + allow_external_schedulers=True, ), ) def chats_table_resource( - zendesk_client: ZendeskAPIClient, update_timestamp: dlt.sources.incremental[str] + zendesk_client: ZendeskAPIClient, + update_timestamp: dlt.sources.incremental[str], ) -> Iterator[TDataItems]: """ Resource for Chats @@ -178,11 +197,16 @@ def chats_table_resource( Yields: dict: A dictionary representing each row of data. 
""" - chat_pages = zendesk_client.get_pages_incremental( + chat_pages = zendesk_client.get_pages( "/api/v2/incremental/chats", "chats", - start_time=parse_iso_like_datetime(update_timestamp.last_value).int_timestamp, - params={"fields": "chats(*)"}, + PaginationType.START_TIME, + params={ + "start_time": ensure_pendulum_datetime( + update_timestamp.last_value + ).int_timestamp, + "fields": "chats(*)", + }, ) for page in chat_pages: yield page @@ -196,51 +220,57 @@ def zendesk_support( credentials: TZendeskCredentials = dlt.secrets.value, load_all: bool = True, pivot_ticket_fields: bool = True, - start_time: Optional[pendulum.DateTime] = DEFAULT_START_DATE, - end_time: Optional[pendulum.DateTime] = None, + start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE, + end_date: Optional[TAnyDateTime] = None, ) -> Iterable[DltResource]: """ Retrieves data from Zendesk Support for tickets, users, brands, organizations, and groups. - `start_time` argument can be used on its own or together with `end_time`. When both are provided + `start_date` argument can be used on its own or together with `end_date`. When both are provided data is limited to items updated in that time range. - The range is "half-open", meaning elements equal and higher than `start_time` and elements lower than `end_time` are included. + The range is "half-open", meaning elements equal and higher than `start_date` and elements lower than `end_date` are included. + All resources opt-in to use Airflow scheduler if run as Airflow task Args: credentials: The credentials for authentication. Defaults to the value in the `dlt.secrets` object. load_all: Whether to load extra resources for the API. Defaults to True. pivot_ticket_fields: Whether to pivot the custom fields in tickets. Defaults to True. - start_time: The start time of the range for which to load. Defaults to January 1st 2000. - end_time: The end time of the range for which to load data. + start_date: The start time of the range for which to load. Defaults to January 1st 2000. + end_date: The end time of the range for which to load data. If end time is not provided, the incremental loading will be enabled and after initial run, only new data will be retrieved Returns: Sequence[DltResource]: Multiple dlt resources. 
""" - start_time_ts = start_time.int_timestamp - start_time_iso_str = start_time.isoformat() - end_time_ts: Optional[int] = None - end_time_iso_str: Optional[str] = None - if end_time: - end_time_ts = end_time.int_timestamp - end_time_iso_str = end_time.isoformat() + start_date_obj = ensure_pendulum_datetime(start_date) + end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None + + start_date_ts = start_date_obj.int_timestamp + start_date_iso_str = start_date_obj.isoformat() + end_date_ts: Optional[int] = None + end_date_iso_str: Optional[str] = None + if end_date_obj: + end_date_ts = end_date_obj.int_timestamp + end_date_iso_str = end_date_obj.isoformat() @dlt.resource(primary_key="id", write_disposition="append") def ticket_events( zendesk_client: ZendeskAPIClient, timestamp: dlt.sources.incremental[int] = dlt.sources.incremental( "timestamp", - initial_value=start_time_ts, - end_value=end_time_ts, + initial_value=start_date_ts, + end_value=end_date_ts, + allow_external_schedulers=True, ), ) -> Iterator[TDataItem]: # URL For ticket events # 'https://d3v-dlthub.zendesk.com/api/v2/incremental/ticket_events.json?start_time=946684800' - event_pages = zendesk_client.get_pages_incremental( + event_pages = zendesk_client.get_pages( "/api/v2/incremental/ticket_events.json", "ticket_events", - timestamp.last_value, + PaginationType.STREAM, + params={"start_time": timestamp.last_value}, ) for page in event_pages: yield page @@ -250,7 +280,7 @@ def ticket_events( @dlt.resource( name="tickets", primary_key="id", - write_disposition="append", + write_disposition="merge", columns={ "tags": {"data_type": "complex"}, "custom_fields": {"data_type": "complex"}, @@ -259,13 +289,13 @@ def ticket_events( def ticket_table( zendesk_client: ZendeskAPIClient, pivot_fields: bool = True, - per_page: int = 1000, updated_at: dlt.sources.incremental[ pendulum.DateTime ] = dlt.sources.incremental( "updated_at", - initial_value=start_time, - end_value=end_time, + initial_value=start_date_obj, + end_value=end_date_obj, + allow_external_schedulers=True, ), ) -> Iterator[TDataItem]: """ @@ -277,7 +307,7 @@ def ticket_table( pivot_fields: Indicates whether to pivot the custom fields in tickets. Defaults to True. per_page: The number of Ticket objects to load per page. Defaults to 1000. updated_at: Incremental source for the 'updated_at' column. - Defaults to dlt.sources.incremental("updated_at", initial_value=start_time). + Defaults to dlt.sources.incremental("updated_at", initial_value=start_date). Yields: TDataItem: Dictionary containing the ticket data. 
@@ -286,12 +316,12 @@ def ticket_table( if pivot_fields: load_ticket_fields_state(zendesk_client) fields_dict = dlt.current.source_state().setdefault(CUSTOM_FIELDS_STATE_KEY, {}) - include_objects = ["users", "groups", "organisation", "brands"] - ticket_pages = zendesk_client.get_pages_incremental( + # include_objects = ["users", "groups", "organisation", "brands"] + ticket_pages = zendesk_client.get_pages( "/api/v2/incremental/tickets", "tickets", - updated_at.last_value.int_timestamp, - params={"include": ",".join(include_objects)}, + PaginationType.STREAM, + params={"start_time": updated_at.last_value.int_timestamp}, ) for page in ticket_pages: yield [ @@ -309,8 +339,9 @@ def ticket_metric_table( zendesk_client: ZendeskAPIClient, time: dlt.sources.incremental[str] = dlt.sources.incremental( "time", - initial_value=start_time_iso_str, - end_value=end_time_iso_str, + initial_value=start_date_iso_str, + end_value=end_date_iso_str, + allow_external_schedulers=True, ), ) -> Iterator[TDataItem]: """ @@ -321,19 +352,19 @@ def ticket_metric_table( zendesk_client: The Zendesk API client instance, used to make calls to Zendesk API. time: Incremental source for the 'time' column, indicating the starting date for retrieving ticket metric events. - Defaults to dlt.sources.incremental("time", initial_value=start_time_iso_str). + Defaults to dlt.sources.incremental("time", initial_value=start_date_iso_str). Yields: TDataItem: Dictionary containing the ticket metric event data. """ # "https://example.zendesk.com/api/v2/incremental/ticket_metric_events?start_time=1332034771" - # all_metric_events = zendesk_client.ticket_metric_events( - # start_time=parse_iso_like_datetime(time.last_value).int_timestamp - # ) - metric_event_pages = zendesk_client.get_pages_incremental( + metric_event_pages = zendesk_client.get_pages( "/api/v2/incremental/ticket_metric_events", "ticket_metric_events", - parse_iso_like_datetime(time.last_value).int_timestamp, + PaginationType.CURSOR, + params={ + "start_time": ensure_pendulum_datetime(time.last_value).int_timestamp, + }, ) for page in metric_event_pages: yield page @@ -358,7 +389,9 @@ def ticket_fields_table(zendesk_client: ZendeskAPIClient) -> Iterator[TDataItem] # get all custom fields and update state if needed, otherwise just load dicts into tables all_fields = list( chain.from_iterable( - zendesk_client.get_pages("/api/v2/ticket_fields.json", "ticket_fields") + zendesk_client.get_pages( + "/api/v2/ticket_fields.json", "ticket_fields", PaginationType.OFFSET + ) ) ) # all_fields = zendesk_client.ticket_fields() @@ -387,23 +420,14 @@ def load_ticket_fields_state( ] # other tables to be loaded - # Tuple of resource_name, endpoint url, Optional[data_key] - resources_to_be_loaded: List[Tuple[str, str, Optional[str]]] = [ - ("users", "/api/v2/users.json", None), - ("sla_policies", "/api/v2/slas/policies.json", None), - ("groups", "/api/v2/groups.json", None), - ("organizations", "/api/v2/organizations.json", None), - ("brands", "/api/v2/brands.json", None), - ] + resources_to_be_loaded = list(SUPPORT_ENDPOINTS) # make a copy if load_all: resources_to_be_loaded.extend(SUPPORT_EXTRA_ENDPOINTS) - for resource, endpoint_url, data_key in resources_to_be_loaded: + for resource, endpoint_url, data_key, cursor_paginated in resources_to_be_loaded: resource_list.append( dlt.resource( basic_resource( - zendesk_client=zendesk_client, - endpoint_url=endpoint_url, - data_key=data_key or resource, + zendesk_client, endpoint_url, data_key or resource, cursor_paginated ), name=resource, 
write_disposition="replace", @@ -416,7 +440,7 @@ def basic_resource( zendesk_client: ZendeskAPIClient, endpoint_url: str, data_key: str, - per_page: int = 1000, + cursor_paginated: bool, ) -> Iterator[TDataItem]: """ Basic loader for most endpoints offered by Zenpy. Supports pagination. Expects to be called as a DLT Resource. @@ -424,12 +448,15 @@ def basic_resource( Args: zendesk_client: The Zendesk API client instance, used to make calls to Zendesk API. resource: The Zenpy endpoint to retrieve data from, usually directly linked to a Zendesk API endpoint. - per_page: The number of resources to retrieve per page. Defaults to 1000. + cursor_paginated: Tells to use CURSOR pagination or OFFSET/no pagination Yields: TDataItem: Dictionary containing the resource data. """ - params = {"per_page": per_page} - pages = zendesk_client.get_pages(endpoint_url, data_key, params) + pages = zendesk_client.get_pages( + endpoint_url, + data_key, + PaginationType.CURSOR if cursor_paginated else PaginationType.OFFSET, + ) yield from pages diff --git a/sources/zendesk/helpers/api_helpers.py b/sources/zendesk/helpers/api_helpers.py index 80235213b..d2adec122 100644 --- a/sources/zendesk/helpers/api_helpers.py +++ b/sources/zendesk/helpers/api_helpers.py @@ -1,19 +1,11 @@ -import logging from typing import Optional, TypedDict, Dict from dlt.common import pendulum, logger +from dlt.common.time import ensure_pendulum_datetime from dlt.common.typing import DictStrAny, DictStrStr, TDataItem from dlt.common.time import parse_iso_like_datetime -from .credentials import ( - ZendeskCredentialsToken, - ZendeskCredentialsEmailPass, - ZendeskCredentialsOAuth, - TZendeskCredentials, -) - - class TCustomFieldInfo(TypedDict): title: str options: DictStrStr @@ -22,7 +14,7 @@ class TCustomFieldInfo(TypedDict): def _parse_date_or_none(value: Optional[str]) -> Optional[pendulum.DateTime]: if not value: return None - return parse_iso_like_datetime(value) + return ensure_pendulum_datetime(value) def process_ticket( diff --git a/sources/zendesk/helpers/talk_api.py b/sources/zendesk/helpers/talk_api.py index 7411132b8..209dca896 100644 --- a/sources/zendesk/helpers/talk_api.py +++ b/sources/zendesk/helpers/talk_api.py @@ -1,12 +1,8 @@ -""" -This module contains everything related to the API client class made to make requests specifically to ZendeskTalk -""" - -from time import sleep -from typing import Dict, Iterator, Optional, Tuple, Union, Any -from dlt.common import logger +from enum import Enum +from typing import Dict, Iterator, Optional, Tuple, Any from dlt.common.typing import DictStrStr, TDataItems, TSecretValue from dlt.sources.helpers.requests import client +from .. import settings from .credentials import ( ZendeskCredentialsEmailPass, ZendeskCredentialsOAuth, @@ -15,6 +11,13 @@ ) +class PaginationType(Enum): + OFFSET = 0 + CURSOR = 1 + STREAM = 2 + START_TIME = 3 + + class ZendeskAPIClient: """ API client used to make requests to Zendesk talk, support and chat API @@ -61,6 +64,7 @@ def get_pages( self, endpoint: str, data_point_name: str, + pagination: PaginationType, params: Optional[Dict[str, Any]] = None, ) -> Iterator[TDataItems]: """ @@ -70,14 +74,23 @@ def get_pages( endpoint: The url to the endpoint, e.g. /api/v2/calls data_point_name: The key which data items are nested under in the response object (e.g. 
calls) params: Optional dict of query params to include in the request + pagination: Type of pagination type used by endpoint Returns: Generator of pages, each page is a list of dict data items """ + # update the page size to enable cursor pagination + params = params or {} + if pagination == PaginationType.CURSOR: + params["page[size]"] = settings.PAGE_SIZE + elif pagination == PaginationType.STREAM: + params["per_page"] = settings.INCREMENTAL_PAGE_SIZE + elif pagination == PaginationType.START_TIME: + params["limit"] = settings.INCREMENTAL_PAGE_SIZE + # make request and keep looping until there is no next page get_url = f"{self.url}{endpoint}" - has_more_pages = True - while has_more_pages: + while get_url: response = client.get( get_url, headers=self.headers, auth=self.auth, params=params ) @@ -85,37 +98,19 @@ def get_pages( response_json = response.json() result = response_json[data_point_name] yield result - get_url = response_json.get("next_page", None) - # Get URL includes params - params = {} - # Ticket API always returns next page URL resulting in infinite loop - # `end_of_stream` property signals there are no more pages. - # See https://developer.zendesk.com/api-reference/ticketing/ticket-management/incremental_exports/#json-format - end_of_stream = response_json.get("end_of_stream", False) - has_more_pages = bool(get_url and result) and not end_of_stream - - def get_pages_incremental( - self, - endpoint: str, - data_point_name: str, - start_time: int, - params: Optional[Dict[str, Any]] = None, - ) -> Iterator[TDataItems]: - """ - Makes a request to an incremental API endpoint - Args: - endpoint: The url to the endpoint, e.g. /api/v2/calls - data_point_name: The key which data items are nested under in the response object (e.g. calls) - start_time: a timestamp of the starting date, i.e. 
a date in unix epoch time (the number of seconds since January 1, 1970, 00:00:00 UTC) - params: Optional dict of query params to include in the request + get_url = None + if pagination == PaginationType.CURSOR: + if response_json["meta"]["has_more"]: + get_url = response_json["links"]["next"] + elif pagination == PaginationType.OFFSET: + get_url = response_json.get("next_page", None) + elif pagination == PaginationType.STREAM: + # See https://developer.zendesk.com/api-reference/ticketing/ticket-management/incremental_exports/#json-format + if not response_json["end_of_stream"]: + get_url = response_json["next_page"] + elif pagination == PaginationType.START_TIME: + if response_json["count"] > 0: + get_url = response_json["next_page"] - Returns: - Generator of pages, each page is a list of dict data items - """ - # start date comes as unix epoch float, need to convert to an integer to make the call to the API - params = params or {} - params["start_time"] = str(start_time) - yield from self.get_pages( - endpoint=endpoint, data_point_name=data_point_name, params=params - ) + params = {} diff --git a/sources/zendesk/requirements.txt b/sources/zendesk/requirements.txt index 4f6ae4199..61947379f 100644 --- a/sources/zendesk/requirements.txt +++ b/sources/zendesk/requirements.txt @@ -1 +1 @@ -dlt>=0.3.5,<0.4 +dlt>=0.3.8,<0.4 diff --git a/sources/zendesk/settings.py b/sources/zendesk/settings.py index 631895fd2..9abbcab7d 100644 --- a/sources/zendesk/settings.py +++ b/sources/zendesk/settings.py @@ -3,60 +3,68 @@ from dlt.common import pendulum DEFAULT_START_DATE = pendulum.datetime(year=2000, month=1, day=1) +PAGE_SIZE = 100 +INCREMENTAL_PAGE_SIZE = 1000 CUSTOM_FIELDS_STATE_KEY = "ticket_custom_fields_v2" -# Tuples of (Resource name, endpoint URL, data_key) +# Tuples of (Resource name, endpoint URL, data_key, supports pagination) # data_key is the key which data list is nested under in responses # if the data key is None it is assumed to be the same as the resource name +# The last element of the tuple says if endpoint supports cursor pagination +SUPPORT_ENDPOINTS = [ + ("users", "/api/v2/users.json", "users", True), + ("sla_policies", "/api/v2/slas/policies.json", None, False), + ("groups", "/api/v2/groups.json", None, True), + ("organizations", "/api/v2/organizations.json", None, True), + ("brands", "/api/v2/brands.json", None, True), +] + SUPPORT_EXTRA_ENDPOINTS = [ - ("activities", "/api/v2/activities.json", None), - ("automations", "/api/v2/automations.json", None), - ("custom_agent_roles", "/api/v2/custom_roles.json", "custom_roles"), - ("dynamic_content", "/api/v2/dynamic_content/items.json", "items"), - ("group_memberships", "/api/v2/group_memberships.json", None), - ("job_status", "/api/v2/job_statuses.json", "job_statuses"), - ("macros", "/api/v2/macros.json", None), - ("organization_fields", "/api/v2/organization_fields.json", None), - ("organization_memberships", "/api/v2/organization_memberships.json", None), - ("recipient_addresses", "/api/v2/recipient_addresses.json", None), - ("requests", "/api/v2/requests.json", None), - ("satisfaction_ratings", "/api/v2/satisfaction_ratings.json", None), - ("sharing_agreements", "/api/v2/sharing_agreements.json", None), - ("skips", "/api/v2/skips.json", None), - ("suspended_tickets", "/api/v2/suspended_tickets.json", None), - ("targets", "/api/v2/targets.json", None), - ("ticket_forms", "/api/v2/ticket_forms.json", None), - ("ticket_metrics", "/api/v2/ticket_metrics.json", None), - ("triggers", "/api/v2/triggers.json", None), - ("user_fields", 
"/api/v2/user_fields.json", None), - ("views", "/api/v2/views.json", None), - ("tags", "/api/v2/tags.json", None), + ("activities", "/api/v2/activities.json", None, True), + ("automations", "/api/v2/automations.json", None, True), + ("custom_agent_roles", "/api/v2/custom_roles.json", "custom_roles", False), + ("dynamic_content", "/api/v2/dynamic_content/items.json", "items", True), + ("group_memberships", "/api/v2/group_memberships.json", None, True), + ("job_status", "/api/v2/job_statuses.json", "job_statuses", True), + ("macros", "/api/v2/macros.json", None, True), + ("organization_fields", "/api/v2/organization_fields.json", None, True), + ("organization_memberships", "/api/v2/organization_memberships.json", None, True), + ("recipient_addresses", "/api/v2/recipient_addresses.json", None, True), + ("requests", "/api/v2/requests.json", None, True), + ("satisfaction_ratings", "/api/v2/satisfaction_ratings.json", None, True), + ("sharing_agreements", "/api/v2/sharing_agreements.json", None, False), + ("skips", "/api/v2/skips.json", None, True), + ("suspended_tickets", "/api/v2/suspended_tickets.json", None, True), + ("targets", "/api/v2/targets.json", None, False), + ("ticket_forms", "/api/v2/ticket_forms.json", None, False), + ("ticket_metrics", "/api/v2/ticket_metrics.json", None, True), + ("triggers", "/api/v2/triggers.json", None, True), + ("user_fields", "/api/v2/user_fields.json", None, True), + ("views", "/api/v2/views.json", None, True), + ("tags", "/api/v2/tags.json", None, True), ] +TALK_ENDPOINTS = [ + ("calls", "/api/v2/channels/voice/calls", None, False), + ("addresses", "/api/v2/channels/voice/addresses", None, False), + ("greeting_categories", "/api/v2/channels/voice/greeting_categories", None, False), + ("greetings", "/api/v2/channels/voice/greetings", None, False), + ("ivrs", "/api/v2/channels/voice/ivr", None, False), + ("phone_numbers", "/api/v2/channels/voice/phone_numbers", None, False), + ("settings", "/api/v2/channels/voice/settings", None, False), + ("lines", "/api/v2/channels/voice/lines", None, False), + ("agents_activity", "/api/v2/channels/voice/stats/agents_activity", None, False), + ( + "current_queue_activity", + "/api/v2/channels/voice/stats/current_queue_activity", + None, + False, + ), +] -TALK_ENDPOINTS = { - "calls": "/api/v2/channels/voice/calls", - "addresses": "/api/v2/channels/voice/addresses", - "greeting_categories": "/api/v2/channels/voice/greeting_categories", - "greetings": "/api/v2/channels/voice/greetings", - "ivrs": "/api/v2/channels/voice/ivr", - "phone_numbers": "/api/v2/channels/voice/phone_numbers", - "settings": "/api/v2/channels/voice/settings", - "lines": "/api/v2/channels/voice/lines", - "agents_activity": "/api/v2/channels/voice/stats/agents_activity", - "current_queue_activity": "/api/v2/channels/voice/stats/current_queue_activity", -} -INCREMENTAL_ENDPOINTS = { +INCREMENTAL_TALK_ENDPOINTS = { "calls": "/api/v2/channels/voice/stats/incremental/calls.json", "legs": "/api/v2/channels/voice/stats/incremental/legs.json", } - -possible_endpoints = { - "availabilities": "/api/v2/channels/voice/availabilities", - "recordings": "/api/v2/channels/voice/calls/{call_id}/recordings", - "digital_lines": "/api/v2/channels/voice/digital_lines", - "agents_overview": "/api/v2/channels/voice/stats/agents_overview", - "account_overview": "/api/v2/channels/voice/stats/account_overview", -} diff --git a/sources/zendesk_pipeline.py b/sources/zendesk_pipeline.py index 85f876f05..cac057181 100644 --- a/sources/zendesk_pipeline.py +++ 
b/sources/zendesk_pipeline.py @@ -49,7 +49,7 @@ def load_support_with_pivoting() -> Any: return info -def incremental_load_all_start_time() -> Any: +def incremental_load_all_start_date() -> Any: """ Implements incremental load when possible to Support, Chat and Talk Endpoints. The default behaviour gets data since the last load time saved in dlt state or 1st Jan 2000 if there has been no previous loading of the resource. With this setting, the sources will load data since the given data for all incremental endpoints. @@ -59,7 +59,7 @@ def incremental_load_all_start_time() -> Any: # Choosing starting point for incremental load - optional, the default is the last load time. If no last load time # the start time will be the 1st day of the millennium # start time needs to be a pendulum datetime object - start_time = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC") + start_date = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC") pipeline = dlt.pipeline( pipeline_name="dlt_zendesk_pipeline", @@ -67,9 +67,9 @@ def incremental_load_all_start_time() -> Any: full_refresh=False, dataset_name="sample_zendesk_data", ) - data = zendesk_support(load_all=True, start_time=start_time) - data_chat = zendesk_chat(start_time=start_time) - data_talk = zendesk_talk(start_time=start_time) + data = zendesk_support(load_all=True, start_date=start_date) + data_chat = zendesk_chat(start_date=start_date) + data_talk = zendesk_talk(start_date=start_date) info = pipeline.run(data=[data, data_chat, data_talk]) return info @@ -88,21 +88,21 @@ def incremental_load_with_backloading() -> Any: ) # Load ranges of dates to load between January 1st 2023 and today - min_start_time = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC") - max_end_time = pendulum.today() + min_start_date = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC") + max_end_date = pendulum.today() # Generate tuples of date ranges, each with 1 week in between. - ranges = make_date_ranges(min_start_time, max_end_time, timedelta(weeks=1)) + ranges = make_date_ranges(min_start_date, max_end_date, timedelta(weeks=1)) # Run the pipeline in a loop for each 1 week range for start, end in ranges: print(f"Loading tickets between {start} and {end}") - data = zendesk_support(start_time=start, end_time=end).with_resources("tickets") + data = zendesk_support(start_date=start, end_date=end).with_resources("tickets") info = pipeline.run(data=data) print(info) # Backloading is done, now we continue loading with incremental state, starting where the backloading left off print(f"Loading with incremental state, starting at {end}") - data = zendesk_support(start_time=end).with_resources("tickets") + data = zendesk_support(start_date=end).with_resources("tickets") info = pipeline.run(data) print(info) diff --git a/tests/google_sheets/test_data_processing.py b/tests/google_sheets/test_data_processing.py index ad301fc60..aea0fef26 100644 --- a/tests/google_sheets/test_data_processing.py +++ b/tests/google_sheets/test_data_processing.py @@ -24,15 +24,15 @@ ), ( "https://docs.google.com/spreadsheets/", - ValueError("Invalid URL. Cannot find spreadsheet ID"), + None, ), ( "https://docs.google.com/spreadsheets/d", - ValueError("Invalid URL. Cannot find spreadsheet ID"), + None, ), ( "https://docs.google.com/spreadsheets/d/", - ValueError("Spreadsheet ID is an empty string"), + None, ), ] TEST_CASES_URL_OR_ID = [ @@ -54,19 +54,20 @@ ), ( "https://docs.google.com/spreadsheets/", - ValueError("Invalid URL. 
Cannot find spreadsheet ID"), + None, ), ( "https://docs.google.com/spreadsheets/d", - ValueError("Invalid URL. Cannot find spreadsheet ID"), + None, ), ( "https://docs.google.com/spreadsheets/d/", - ValueError("Spreadsheet ID is an empty string"), + None, ), ("1aBcDeFgHiJkLmNopQrStUvWxYz1234567890", "1aBcDeFgHiJkLmNopQrStUvWxYz1234567890"), ("", ""), ] + TEST_CASES_DATE = [ (37621, pendulum.datetime(year=2002, month=12, day=31, tz="UTC")), ( @@ -86,269 +87,6 @@ ), ), ] -TEST_CASES_RANGE = [ - ("sheet1", ["sheet1", "sheet1!1:2"]), - ("sheet1!G2:O28", ["sheet1", "sheet1!G2:O3"]), - ("sheet1!G2:H28", ["sheet1", "sheet1!G2:H3"]), - ("sheet1!A:B", ["sheet1", "sheet1!A1:B2"]), - ("sheet1!1:4", ["sheet1", "sheet1!1:2"]), - ("sheet1!AA23:BB55", ["sheet1", "sheet1!AA23:BB24"]), -] - -row_values_1 = [ - { - "userEnteredValue": {"stringValue": "test1"}, - "effectiveValue": {"stringValue": "test1"}, - "formattedValue": "test1", - "effectiveFormat": { - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, "bottom": 2, "left": 3}, - "horizontalAlignment": "LEFT", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - "backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, - { - "userEnteredValue": {"numberValue": 1}, - "effectiveValue": {"numberValue": 1}, - "formattedValue": "1", - "effectiveFormat": { - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, "bottom": 2, "left": 3}, - "horizontalAlignment": "RIGHT", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - "backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, - { - "userEnteredValue": {"numberValue": 1.01}, - "effectiveValue": {"numberValue": 1.01}, - "formattedValue": "1.01", - "effectiveFormat": { - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, "bottom": 2, "left": 3}, - "horizontalAlignment": "RIGHT", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - "backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, - { - "userEnteredValue": {"boolValue": True}, - "effectiveValue": {"boolValue": True}, - "formattedValue": "TRUE", - "effectiveFormat": { - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, "bottom": 2, "left": 3}, - "horizontalAlignment": "CENTER", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - 
"backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, - { - "userEnteredValue": {"formulaValue": "=B2+C2"}, - "effectiveValue": {"numberValue": 2.01}, - "formattedValue": "2.01", - "effectiveFormat": { - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, "bottom": 2, "left": 3}, - "horizontalAlignment": "RIGHT", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - "backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, - { - "userEnteredValue": {"numberValue": 37621}, - "effectiveValue": {"numberValue": 37621}, - "formattedValue": "12/31/2002 0:00:00", - "userEnteredFormat": { - "numberFormat": { - "type": "DATE_TIME", - "pattern": 'm"/"d"/"yyyy" "h":"mm":"ss', - } - }, - "effectiveFormat": { - "numberFormat": { - "type": "DATE_TIME", - "pattern": 'm"/"d"/"yyyy" "h":"mm":"ss', - }, - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, "bottom": 2, "left": 3}, - "horizontalAlignment": "RIGHT", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - "backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, -] -row_values_2 = [ - { - "userEnteredValue": {"numberValue": 3}, - "effectiveValue": {"numberValue": 3}, - "formattedValue": "3.00E+00", - "userEnteredFormat": { - "numberFormat": {"type": "SCIENTIFIC", "pattern": "0.00E+00"} - }, - "effectiveFormat": { - "numberFormat": {"type": "SCIENTIFIC", "pattern": "0.00E+00"}, - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, "bottom": 2, "left": 3}, - "horizontalAlignment": "RIGHT", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - "backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, - {}, - { - "userEnteredValue": {"numberValue": 3}, - "effectiveValue": {"numberValue": 3}, - "formattedValue": "3", - "effectiveFormat": { - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, "bottom": 2, "left": 3}, - "horizontalAlignment": "RIGHT", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - "backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, - { - "userEnteredValue": {"numberValue": 3.6}, - "effectiveValue": {"numberValue": 3.6}, - "formattedValue": "3.6", - "effectiveFormat": { - "backgroundColor": {"red": 1, "green": 1, "blue": 1}, - "padding": {"top": 2, "right": 3, 
"bottom": 2, "left": 3}, - "horizontalAlignment": "RIGHT", - "verticalAlignment": "BOTTOM", - "wrapStrategy": "OVERFLOW_CELL", - "textFormat": { - "foregroundColor": {}, - "fontFamily": "Arial", - "fontSize": 10, - "bold": False, - "italic": False, - "strikethrough": False, - "underline": False, - "foregroundColorStyle": {"rgbColor": {}}, - }, - "hyperlinkDisplayType": "PLAIN_TEXT", - "backgroundColorStyle": {"rgbColor": {"red": 1, "green": 1, "blue": 1}}, - }, - }, -] -TEST_CASES_DATA_TYPES = [ - (row_values_1, [False, False, False, False, False, True]), - (row_values_2, [False, False, False, False]), -] -TEST_CASES_CONVERT_COL = [ - (0, "A"), - (1, "B"), - (25, "Z"), - (26, "AA"), - (27, "AB"), - (51, "AZ"), - (52, "BA"), - (53, "BB"), - (675, "YZ"), - (676, "ZA"), - (700, "ZY"), - (701, "ZZ"), - (702, "AAA"), -] @pytest.mark.parametrize("url, expected", TEST_CASES_URL) @@ -359,9 +97,9 @@ def test_process_url(url: str, expected: str): :param: expected: expected output str """ try: - assert data_processing.process_url(url) == expected - except ValueError as e: - assert str(e) == str(expected) + assert data_processing.extract_spreadsheet_id_from_url(url) == expected + except ValueError: + assert expected is None @pytest.mark.parametrize("url_or_id, expected", TEST_CASES_URL_OR_ID) @@ -373,8 +111,8 @@ def test_get_spreadsheet_id(url_or_id: str, expected: str): """ try: assert data_processing.get_spreadsheet_id(url_or_id) == expected - except ValueError as e: - assert str(e) == str(expected) + except ValueError: + assert expected is None @pytest.mark.parametrize("serial_number, expected", TEST_CASES_DATE) @@ -386,19 +124,6 @@ def test_serial_date_to_datetime( :param: serial_number- float or int date input :param: expected: expected output datetime """ - assert data_processing.serial_date_to_datetime(serial_number) == expected - - -@pytest.mark.parametrize("sheet_range, expected", TEST_CASES_RANGE) -def test_get_first_rows(sheet_range: str, expected: str): - assert data_processing.get_first_rows(sheet_range) == expected - - -@pytest.mark.parametrize("value_dict_row, expected", TEST_CASES_DATA_TYPES) -def test_is_date_datatype(value_dict_row: List[DictStrAny], expected: bool): - assert data_processing.is_date_datatype(value_dict_row) == expected - - -@pytest.mark.parametrize("col_idx, expected", TEST_CASES_CONVERT_COL) -def test_convert_col_a1(col_idx: int, expected: str): - assert data_processing._convert_col_a1(col_idx) == expected + assert ( + data_processing.serial_date_to_datetime(serial_number, "timestamp") == expected + ) diff --git a/tests/google_sheets/test_google_sheets_source.py b/tests/google_sheets/test_google_sheets_source.py index 510d1eeac..f6e93970b 100644 --- a/tests/google_sheets/test_google_sheets_source.py +++ b/tests/google_sheets/test_google_sheets_source.py @@ -1,23 +1,32 @@ import logging +from typing import Tuple import pytest import dlt from dlt.common.pipeline import LoadInfo from sources.google_sheets import google_spreadsheet -from tests.utils import ALL_DESTINATIONS, assert_load_info +from tests.utils import ( + ALL_DESTINATIONS, + assert_load_info, + assert_query_data, + load_table_counts, +) # list expected tables and the number of columns they are supposed to have -ALL_TABLES = [ +ALL_RANGES = [ + "empty", "all_types", "empty_row", "empty_rows", "has_empty", "hole_middle", + "table_in_middle", "inconsistent_types", "more_data", "more_headers_than_data", "NamedRange1", + "NamedRange2", "only_data", "only_headers", "Sheet 1", @@ -26,19 +35,25 @@ "sheet4", 
"two_tables", ] -COL_NUMS = [6, 5, 5, 5, 7, 5, 4, 8, 4, 2, 2, 4, 5, 9, 1, 9] + +SKIPPED_RANGES = [ + "empty", + "only_data", + "only_headers", + "NamedRange2", +] + ALL_TABLES_LOADED = [ "all_types", "empty_row", "empty_rows", "has_empty", "hole_middle", + "table_in_middle", "inconsistent_types", "more_data", "more_headers_than_data", "named_range1", - "only_data", - "only_headers", "sheet_1", "sheet2", "sheet3", @@ -48,55 +63,30 @@ ] -def create_pipeline( - destination_name, - dataset_name, - full_refresh=True, - range_names=None, - get_sheets=True, - get_named_ranges=True, -) -> (LoadInfo, dlt.Pipeline): - """ - Helper, creates a simple pipeline and returns it along with the load info. - """ - pipeline = dlt.pipeline( - destination=destination_name, - full_refresh=full_refresh, - dataset_name=dataset_name, - ) - data = google_spreadsheet( - range_names=range_names, - get_sheets=get_sheets, - get_named_ranges=get_named_ranges, - ) - info = pipeline.run(data) - return info, pipeline - - -def test_sample_load() -> None: +def test_single_explicit_range_load() -> None: """ Tests access for a spreadsheet in config.toml and check that the pipeline was loaded correctly. """ - info = create_pipeline( + info, pipeline = _run_pipeline( destination_name="postgres", dataset_name="test_google_sheet_data", range_names=["Sheet 1"], get_sheets=False, get_named_ranges=False, - )[0] + ) assert_load_info(info) + user_tables = pipeline.default_schema.data_tables() + assert set([t["name"] for t in user_tables]) == {"sheet_1", "spreadsheet_info"} @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_full_load(destination_name: str) -> None: """ Sample pipeline run for all ranges in test1 spreadsheet. Checks the correct amount of tables is created in the database. 
- @:param: destination_name - redshift/bigquery/postgres """ - # FULL PIPELINE RUN - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, dataset_name="test_full_load" ) assert_load_info(info) @@ -109,59 +99,78 @@ def test_full_load(destination_name: str) -> None: # check load metadata with pipeline.sql_client() as c: - # check every table has the correct name in the metadata table and the correct number of columns - sql_query = "SELECT loaded_range, num_cols FROM spreadsheet_info ORDER BY LOWER(loaded_range);" + # check every table has the correct name in the metadata table + sql_query = "SELECT range_name, skipped FROM spreadsheet_info" with c.execute_query(sql_query) as cur: rows = list(cur.fetchall()) - assert len(rows) == len(ALL_TABLES) - for i in range(len(rows)): - assert rows[i][0] == ALL_TABLES[i] - assert rows[i][1] == COL_NUMS[i] + loaded_ranges = [r[0] for r in rows] + assert set(loaded_ranges) == set(ALL_RANGES) + skipped_ranges = [r[0] for r in rows if r[1]] + assert set(skipped_ranges) == set(SKIPPED_RANGES) @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_appending(destination_name) -> None: +def test_custom_ranges(destination_name) -> None: """ Test that adding new data in the sheets will add new data to the destinations - @:param: destination_name - redshift/bigquery/postgres """ # Fetch ranges from pipeline and check test_ranges = ["Sheet 1!A1:D2", "Sheet 1!A1:D4"] test_ranges_table = ["sheet_1_a1_d2", "sheet_1_a1_d4"] - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, - dataset_name="test_appending", + dataset_name="test_custom_ranges", range_names=test_ranges, get_sheets=False, get_named_ranges=False, ) assert_load_info(info) - # TODO: decide what needs to be done when range is slightly increased + # test2 column is missing because it contains no data + assert set(pipeline.default_schema.get_table_columns("sheet_1_a1_d2").keys()) == { + "test", + "add1", + "add2", + "_dlt_id", + "_dlt_load_id", + } + # in the second table a variant column is created + assert set(pipeline.default_schema.get_table_columns("sheet_1_a1_d4").keys()) == { + "test", + "add1", + "add2", + "add2__v_text", + "_dlt_id", + "_dlt_load_id", + } + # check table rows are appended with pipeline.sql_client() as c: sql_query1 = f"SELECT * FROM {test_ranges_table[0]};" - sql_query2 = f"SELECT * FROM {test_ranges_table[1]};" with c.execute_query(sql_query1) as cur: rows = list(cur.fetchall()) assert len(rows) == 1 + assert rows[0][:-2] == (3, 3, 3.6) + + sql_query2 = f"SELECT * FROM {test_ranges_table[1]} ORDER BY test;" with c.execute_query(sql_query2) as cur: rows = list(cur.fetchall()) assert len(rows) == 3 + # check variant column value + assert rows[1][-1] == '"test"' @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_all_data_types(destination_name) -> None: """ Test all data types are recognized correctly. Just checks all columns are formatted as the same data type. 
- @:param: destination_name - redshift/bigquery/postgres """ table_name_db = "all_types" # run pipeline only for the specific table with all data types and grab that table - info, pipeline_types = create_pipeline( + info, pipeline_types = _run_pipeline( destination_name=destination_name, dataset_name="test_all_data_types", range_names=["all_types"], @@ -173,7 +182,6 @@ def test_all_data_types(destination_name) -> None: schema = pipeline_types.default_schema assert table_name_db in schema.tables - # pipeline doesn't reset schema.data_tables when run with other tests, so we have to check all the tables in the schema and check that the name matches test_table = schema.get_table(table_name_db) # check all columns assert test_table["columns"]["text_types"]["data_type"] == "text" @@ -181,68 +189,41 @@ def test_all_data_types(destination_name) -> None: assert test_table["columns"]["float_types"]["data_type"] == "double" assert test_table["columns"]["bool_types"]["data_type"] == "bool" assert test_table["columns"]["formula_types"]["data_type"] == "double" - assert test_table["columns"]["date_types"]["data_type"] == "timestamp" + assert test_table["columns"]["datetime_types"]["data_type"] == "timestamp" + assert test_table["columns"]["date_types"]["data_type"] == "date" + assert test_table["columns"]["time_types"]["data_type"] == "timestamp" @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_empty_row(destination_name) -> None: +def test_empty_rows(destination_name) -> None: """ Test ranges with a single empty row are processed correctly - @:param: destination_name - redshift/bigquery/postgres """ - # run pipeline only for the specific table with all data types and grab that table - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, dataset_name="test_empty_row", - range_names=["empty_row"], - get_named_ranges=False, - get_sheets=False, - ) - assert_load_info(info) - - # check table rows are appended - with pipeline.sql_client() as c: - sql_query = "SELECT * FROM empty_row;" - with c.execute_query(sql_query) as cur: - rows = list(cur.fetchall()) - assert len(rows) == 10 - - -@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_empty_rows(destination_name) -> None: - """ - Test ranges with multiple empty rows are processed correctly - @:param: destination_name - redshift/bigquery/postgres - """ - - # run pipeline only for the specific table with all data types and grab that table - info, pipeline = create_pipeline( - destination_name=destination_name, - dataset_name="test_empty_rows", - range_names=["empty_rows"], + range_names=["empty_row", "empty_rows"], get_named_ranges=False, get_sheets=False, ) assert_load_info(info) # check table rows are appended - with pipeline.sql_client() as c: - sql_query = "SELECT * FROM empty_rows;" - with c.execute_query(sql_query) as cur: - rows = list(cur.fetchall()) - assert len(rows) == 9 + assert load_table_counts(pipeline, "empty_row", "empty_rows") == { + "empty_row": 10, + "empty_rows": 9, + } @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_has_empty(destination_name) -> None: +def test_has_nulls_for_empty_cells(destination_name) -> None: """ Test ranges with random null values are processed correctly - @:param: destination_name - redshift/bigquery/postgres """ # run pipeline only for the specific table with all data types and grab that table - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( 
destination_name=destination_name, dataset_name="test_has_empty", range_names=["has_empty"], @@ -253,12 +234,7 @@ def test_has_empty(destination_name) -> None: # check table rows are appended with pipeline.sql_client() as c: - sql_query1 = "SELECT * FROM has_empty;" check_null_query = "SELECT * FROM has_empty WHERE redi2 is Null OR test2 is Null or date_test is Null;" - # check num rows - with c.execute_query(sql_query1) as cur: - rows = list(cur.fetchall()) - assert len(rows) == 9 # check specific values are null with c.execute_query(check_null_query) as cur: rows = list(cur.fetchall()) @@ -270,11 +246,10 @@ def test_has_empty(destination_name) -> None: def test_inconsistent_types(destination_name) -> None: """ Test ranges that have different data types in a single column - @:param: destination_name - redshift/bigquery/postgres """ # run pipeline only for the specific table with all data types and grab that table - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, dataset_name="test_inconsistent_types", range_names=["inconsistent_types"], @@ -282,6 +257,23 @@ def test_inconsistent_types(destination_name) -> None: get_named_ranges=False, ) assert_load_info(info) + # mind that "date_test" column has bool and text variants + assert set( + pipeline.default_schema.get_table_columns("inconsistent_types").keys() + ) == { + "float_test", + "bool_test", + "test2", + "date_test__v_text", + "redi2", + "bool_test__v_text", + "test2__v_text", + "_dlt_load_id", + "_dlt_id", + "date_test__v_bool", + "redi2__v_double", + "date_test", + } with pipeline.sql_client() as c: sql_query = ( @@ -289,6 +281,7 @@ def test_inconsistent_types(destination_name) -> None: "test2__v_text is not Null " "OR redi2__v_double is not Null " "OR date_test__v_text is not Null " + "OR date_test__v_bool is not Null " "OR bool_test__v_text is not Null;" ) with c.execute_query(sql_query) as cur: @@ -298,14 +291,11 @@ def test_inconsistent_types(destination_name) -> None: @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_more_headers(destination_name) -> None: +def test_more_headers_than_data(destination_name) -> None: """ - Test ranges that have more headers than data - @:param: destination_name - redshift/bigquery/postgres + Test ranges that have more headers than data. Columns with headers and without data are dropped. """ - - # run pipeline only for the specific table with all data types and grab that table - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, dataset_name="test_more_headers", range_names=["more_headers_than_data"], @@ -314,6 +304,11 @@ def test_more_headers(destination_name) -> None: ) assert_load_info(info) + # no extra columns in schema + assert {"extra_header", "extra_header2"}.intersection( + set(pipeline.default_schema.get_table_columns("more_headers_than_data").keys()) + ) == set() + # run query to check number of columns with pipeline.sql_client() as c: sql_query = "SELECT * FROM more_headers_than_data;" @@ -325,13 +320,11 @@ def test_more_headers(destination_name) -> None: @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_more_data(destination_name) -> None: +def test_more_data_than_headers(destination_name) -> None: """ - Test ranges that have more data than headers - @:param: destination_name - redshift/bigquery/postgres + Test ranges that have more data than headers. Columns without headers will be dropped! 
""" - # run pipeline only for the specific table with all data types and grab that table - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, dataset_name="test_more_headers", range_names=["more_data"], @@ -339,26 +332,24 @@ def test_more_data(destination_name) -> None: get_named_ranges=False, ) assert_load_info(info) - pipeline_schema = pipeline.default_schema - logging.warning(pipeline_schema) - with pipeline.sql_client() as c: - sql_query = "SELECT * FROM more_data;" - with c.execute_query(sql_query) as cur: - rows = list(cur.fetchall()) - for row in rows: - # each row must have 6 columns(including the 2 dlt ones) - assert len(row) == 6 + assert set(pipeline.default_schema.get_table_columns("more_data").keys()) == { + "text_types", + "number_types", + "float_types", + "bool_types", + "_dlt_load_id", + "_dlt_id", + } @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_two_tables(destination_name) -> None: +def test_two_overlapping_tables(destination_name) -> None: """ - Test ranges that have 2 tables inside the range - @:param: destination_name - redshift/bigquery/postgres + Test ranges that have 2 tables inside the range - standard rules apply so only parts of the second table overlapping with columns of the first table will appear """ # run pipeline only for the specific table with all data types and grab that table - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, dataset_name="test_two_tables", range_names=["two_tables"], @@ -367,24 +358,37 @@ def test_two_tables(destination_name) -> None: ) assert_load_info(info) - with pipeline.sql_client() as c: - # this query will return all rows from 2nd table appended to the 1st table - sql_query = "SELECT * FROM two_tables WHERE _10 is NULL;" - with c.execute_query(sql_query) as cur: - rows = list(cur.fetchall()) - # 11 rows with inconsistent types expected - assert len(rows) == 11 + # all the headers are automatic + headers = [ + h + for h in pipeline.default_schema.get_table_columns("two_tables").keys() + if h.startswith("col") + ] + assert len(headers) == 9 + + # number of rows from two tables + assert load_table_counts(pipeline, "two_tables") == {"two_tables": 22} + + # assert first column + assert_query_data( + pipeline, + "SELECT col_1 FROM two_tables ORDER BY col_1 NULLS FIRST", + [None] * 11 + list(range(10, 21)), + ) + # assert first overlapped column + assert_query_data( + pipeline, "SELECT col_7 FROM two_tables ORDER BY col_7 ASC", list(range(1, 23)) + ) @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_hole_middle(destination_name) -> None: """ Test ranges that have 2 tables inside the range - @:param: destination_name - redshift/bigquery/postgres """ # run pipeline only for the specific table with all data types and grab that table - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, dataset_name="test_hole_middle", range_names=["hole_middle"], @@ -392,20 +396,17 @@ def test_hole_middle(destination_name) -> None: get_named_ranges=False, ) assert_load_info(info) - - with pipeline.sql_client() as c: - # this query will return all rows from 2nd table appended to the 1st table - sql_query = "SELECT * FROM hole_middle;" - with c.execute_query(sql_query) as cur: - rows = list(cur.fetchall()) - # 10 rows and 7 columns (including dlt ones) expected - assert len(rows) == 10 - for row in rows: - assert len(row) == 7 + assert 
load_table_counts(pipeline, "hole_middle") == {"hole_middle": 11} + headers = [ + h + for h in pipeline.default_schema.get_table_columns("hole_middle").keys() + if h.startswith("col") + ] + assert len(headers) == 5 @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_named_range(destination_name) -> None: +def test_explicit_named_range(destination_name) -> None: """ Test that everything inside a named range is loaded correctly @:param: destination_name - redshift/bigquery/postgres @@ -415,43 +416,123 @@ def test_named_range(destination_name) -> None: # run pipeline only for the specific table with all data types and grab that table # with these settings, the pipeline should only have the named_range1 table inside. - info, pipeline = create_pipeline( + info, pipeline = _run_pipeline( destination_name=destination_name, dataset_name="test_named_range", + range_names=["NamedRange1", "NamedRange2"], get_sheets=False, - get_named_ranges=True, + get_named_ranges=False, ) assert_load_info(info) # check columns have the correct data types in the schema - # pipeline doesn't reset schema.data_tables when run with other tests, so we have to check all the tables in the schema and check that the name matches schema = pipeline.default_schema assert table_name_db in schema.tables + # this one was skipped + assert "named_range2" not in schema.tables test_table = schema.get_table(table_name_db) - # check all column data types are correct - assert test_table["columns"]["test3"]["data_type"] == "text" - assert test_table["columns"]["_3"]["data_type"] == "bigint" - assert test_table["columns"]["_1_03"]["data_type"] == "double" - assert test_table["columns"]["true"]["data_type"] == "bool" + # check all column data types are correct - also order must match + assert [ + c["data_type"] + for c in test_table["columns"].values() + if not c["name"].startswith("_dlt") + ] == ["text", "bigint", "double", "bool"] # check all values are saved correctly expected_rows = [ + ("test3", 3, 1.03, True), ("test4", 4, 1.04, True), ("test5", 5, 1.05, True), ("test6", 6, 1.06, True), ] + # perform queries to check data inside with pipeline.sql_client() as c: - sql_query = f"SELECT test3, _3, _1_03, true FROM {table_name_db};" + quoted_range = c.capabilities.escape_identifier("range") + # columns are auto named - we hit a middle of a table with this range + sql_query = f"SELECT col_1, col_2, col_3, col_4 FROM {table_name_db};" with c.execute_query(sql_query) as cur: rows = list(cur.fetchall()) - # 3 rows and 4 columns expected - assert len(rows) == 3 + assert len(rows) == 4 for i in range(len(rows)): processed_row = _row_helper(rows[i], destination_name) assert processed_row == expected_rows[i] + # check spreadsheet info + assert_query_data( + pipeline, + f"SELECT {quoted_range} FROM spreadsheet_info ORDER BY {quoted_range} ASC", + ["empty!ZY1:AAA4", "more_data!A4:D7"], + ) + + +def test_invalid_range(): + with pytest.raises(Exception) as py_ex: + _run_pipeline( + destination_name="duckdb", + dataset_name="test_named_range", + range_names=["NamedRangeNotExists"], + get_sheets=False, + get_named_ranges=False, + ) + assert "Unable to parse range: NamedRangeNotExists" in str(py_ex.value) + + +def test_auto_header_names(): + pass + + +def test_no_ranges(): + # no ranges to extract + info, pipeline = _run_pipeline( + destination_name="duckdb", + dataset_name="test_table_in_middle", + get_sheets=False, + get_named_ranges=False, + ) + # spreadsheet_info was empty so not created + assert 
pipeline.default_schema.data_tables() == [] + assert_load_info(info) + + +def test_table_not_A1(): + # make sure all data is loaded when it does not start at A1 + info, pipeline = _run_pipeline( + destination_name="duckdb", + dataset_name="test_table_in_middle", + range_names=["table_in_middle"], + get_sheets=False, + get_named_ranges=False, + ) + assert_load_info(info) + + # 11 rows + assert load_table_counts(pipeline, "table_in_middle") == {"table_in_middle": 11} + # 9 auto generated headers + headers = [ + h + for h in pipeline.default_schema.get_table_columns("table_in_middle").keys() + if h.startswith("col") + ] + assert len(headers) == 9 + # check range + assert_query_data( + pipeline, + "SELECT range FROM spreadsheet_info ORDER BY range ASC", + ["table_in_middle!AB9:AJ1000"], + ) + # check first column + assert_query_data( + pipeline, + "SELECT col_1 FROM table_in_middle ORDER BY col_1 ASC", + list(map(str, range(11, 21))) + ["AB9_head"], + ) + # check last column + assert_query_data( + pipeline, "SELECT col_9 FROM table_in_middle ORDER BY col_9 ASC", range(90, 101) + ) + def _row_helper(row, destination_name): """ @@ -466,3 +547,29 @@ def _row_helper(row, destination_name): else: # redshift & postgres return row + + +def _run_pipeline( + destination_name, + dataset_name, + full_refresh=True, + range_names=None, + get_sheets=True, + get_named_ranges=True, +) -> Tuple[LoadInfo, dlt.Pipeline]: + """ + Helper, creates a simple pipeline and returns it along with the load info. + """ + pipeline = dlt.pipeline( + destination=destination_name, + full_refresh=full_refresh, + dataset_name=dataset_name, + ) + data = google_spreadsheet( + "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580", + range_names=range_names, + get_sheets=get_sheets, + get_named_ranges=get_named_ranges, + ) + info = pipeline.run(data) + return info, pipeline diff --git a/tests/shopify_dlt/test_date_helper.py b/tests/shopify_dlt/test_date_helper.py deleted file mode 100644 index 37de9b5eb..000000000 --- a/tests/shopify_dlt/test_date_helper.py +++ /dev/null @@ -1,52 +0,0 @@ -import pytest -from datetime import datetime, date, timezone # noqa: I251 -from dlt.common import pendulum - -from sources.shopify_dlt.date_helper import ensure_pendulum_datetime, TAnyDateTime - - -test_params = [ - # python datetime without tz - ( - datetime(2021, 1, 1, 0, 0, 0), - pendulum.datetime(2021, 1, 1, 0, 0, 0).in_tz("UTC"), - ), - # python datetime with tz - ( - datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - pendulum.datetime(2021, 1, 1, 0, 0, 0).in_tz("UTC"), - ), - # python date object - (date(2021, 1, 1), pendulum.datetime(2021, 1, 1, 0, 0, 0).in_tz("UTC")), - # pendulum datetime with tz - ( - pendulum.datetime(2021, 1, 1, 0, 0, 0).in_tz("UTC"), - pendulum.datetime(2021, 1, 1, 0, 0, 0).in_tz("UTC"), - ), - # pendulum datetime without tz - ( - pendulum.datetime(2021, 1, 1, 0, 0, 0), - pendulum.datetime(2021, 1, 1, 0, 0, 0).in_tz("UTC"), - ), - # iso datetime in UTC - ("2021-01-01T00:00:00+00:00", pendulum.datetime(2021, 1, 1, 0, 0, 0).in_tz("UTC")), - # iso datetime with non utc tz - ( - "2021-01-01T00:00:00+05:00", - pendulum.datetime(2021, 1, 1, 0, 0, 0, tz=5), - ), - # iso datetime without tz - ( - "2021-01-01T05:02:32", - pendulum.datetime(2021, 1, 1, 5, 2, 32).in_tz("UTC"), - ), - # iso date - ("2021-01-01", pendulum.datetime(2021, 1, 1, 0, 0, 0).in_tz("UTC")), -] - - -@pytest.mark.parametrize("date_value, expected", test_params) -def test_ensure_pendulum_datetime( - date_value: TAnyDateTime, expected: pendulum.DateTime -) -> 
None: - assert ensure_pendulum_datetime(date_value) == expected diff --git a/tests/shopify_dlt/test_shopify_source.py b/tests/shopify_dlt/test_shopify_source.py index 02bd820b4..bb9f34bdb 100644 --- a/tests/shopify_dlt/test_shopify_source.py +++ b/tests/shopify_dlt/test_shopify_source.py @@ -5,11 +5,11 @@ from requests_mock import Mocker import dlt from dlt.common import pendulum +from dlt.common.time import ensure_pendulum_datetime from dlt.sources.helpers import requests from tests.utils import ALL_DESTINATIONS, assert_load_info, load_table_counts from sources.shopify_dlt import shopify_source -from sources.shopify_dlt.date_helper import ensure_pendulum_datetime @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) diff --git a/tests/utils.py b/tests/utils.py index 1543443f9..77dec67dc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -174,6 +174,7 @@ def assert_query_data( with p.sql_client(schema_name=schema_name) as c: with c.execute_query(sql) as cur: rows = list(cur.fetchall()) + print(rows) assert len(rows) == len(table_data) for row, d in zip(rows, table_data): row = list(row) @@ -195,10 +196,13 @@ def assert_load_info(info: LoadInfo, expected_load_packages: int = 1) -> None: def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: """Returns row counts for `table_names` as dict""" - query = "\nUNION ALL\n".join( - [f"SELECT '{name}' as name, COUNT(1) as c FROM {name}" for name in table_names] - ) with p.sql_client() as c: + query = "\nUNION ALL\n".join( + [ + f"SELECT '{name}' as name, COUNT(1) as c FROM {c.make_qualified_table_name(name)}" + for name in table_names + ] + ) with c.execute_query(query) as cur: rows = list(cur.fetchall()) return {r[0]: r[1] for r in rows} @@ -208,16 +212,14 @@ def load_table_distinct_counts( p: dlt.Pipeline, distinct_column: str, *table_names: str ) -> DictStrAny: """Returns counts of distinct values for column `distinct_column` for `table_names` as dict""" - query = "\nUNION ALL\n".join( - [ - f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" - for name in table_names - ] - ) with p.sql_client() as c: + query = "\nUNION ALL\n".join( + [ + f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {c.make_qualified_table_name(name)}" + for name in table_names + ] + ) + with c.execute_query(query) as cur: rows = list(cur.fetchall()) return {r[0]: r[1] for r in rows} - - -# def assert_tables_filled() diff --git a/tests/zendesk/test_zendesk_source.py b/tests/zendesk/test_zendesk_source.py index 1fc17ffab..3a10a715c 100644 --- a/tests/zendesk/test_zendesk_source.py +++ b/tests/zendesk/test_zendesk_source.py @@ -1,3 +1,4 @@ +from unittest.mock import patch import pytest from typing import List, Dict, Any, Iterable import dlt @@ -19,6 +20,7 @@ SUPPORT_TABLES = { "ticket_fields", "tickets", + "ticket_events", "ticket_metric_events", "users", "sla_policies", @@ -88,6 +90,7 @@ INCREMENTAL_TABLES = [ "tickets", "ticket_metric_events", + "ticket_events", "chats", ] # calls_incremental and legs_incremental have no data so not added here yet @@ -148,16 +151,23 @@ def test_incrementing(destination_name: str) -> None: @:param: destination_name - redshift/bigquery/postgres """ - # run pipeline - pipeline_incremental = _create_pipeline( - destination_name=destination_name, - full_refresh=True, - dataset_name="test_incremental", - include_chat=True, - include_support=True, - include_talk=True, - ) + with patch("sources.zendesk.helpers.talk_api.settings.INCREMENTAL_PAGE_SIZE", 2): + # run pipeline + 
pipeline_incremental = _create_pipeline( + destination_name=destination_name, + full_refresh=True, + dataset_name="test_incremental", + include_chat=True, + include_support=True, + include_talk=True, + ) counts = load_table_counts(pipeline_incremental, *INCREMENTAL_TABLES) + assert counts == { + "tickets": 5, + "chats": 3, + "ticket_events": 10, + "ticket_metric_events": 65, + } # run pipeline again and check that the number of distinct data points hasn't changed info = pipeline_incremental.run( @@ -169,7 +179,7 @@ def test_incrementing(destination_name: str) -> None: @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -def test_tickets_end_time_incremental(destination_name: str) -> None: +def test_tickets_end_date_incremental(destination_name: str) -> None: """Test time range loading tickets with end_value and incremental""" pipeline = dlt.pipeline( destination=destination_name, @@ -180,10 +190,10 @@ def test_tickets_end_time_incremental(destination_name: str) -> None: # This ticket should be included in results first_ticket_time = parse_iso_like_datetime("2023-02-06T09:52:18Z") # End is exact ts of a ticket in the middle - end_time = parse_iso_like_datetime("2023-07-18T17:14:39Z") + end_date = parse_iso_like_datetime("2023-07-18T17:14:39Z") data = zendesk_support( - start_time=first_ticket_time, - end_time=end_time, + start_date=first_ticket_time, + end_date=end_date, ).with_resources("tickets") info = pipeline.run(data, write_disposition="append") @@ -198,10 +208,10 @@ def test_tickets_end_time_incremental(destination_name: str) -> None: ] assert first_ticket_time in rows - assert all(value < end_time for value in rows) + assert all(value < end_date for value in rows) - # Load again incremental from end_time - data = zendesk_support(start_time=end_time) + # Load again incremental from end_date + data = zendesk_support(start_date=end_date) info = pipeline.run(data, write_disposition="append") assert_load_info(info) with pipeline.sql_client() as client: @@ -213,9 +223,9 @@ def test_tickets_end_time_incremental(destination_name: str) -> None: ] assert len(rows2) > len(rows) - assert end_time in rows2 + assert end_date in rows2 # Some rows are after the start time - assert [value for value in rows2 if value > end_time] + assert [value for value in rows2 if value > end_date] # Run incremental again, no new data should be added data = zendesk_support() @@ -241,12 +251,35 @@ def test_full_load_support(destination_name: str) -> None: """ # FULL PIPELINE RUN - pipeline = _create_pipeline( - destination_name=destination_name, - dataset_name="test_full_load", - include_support=True, - ) + with patch("sources.zendesk.helpers.talk_api.settings.PAGE_SIZE", 2): + pipeline = _create_pipeline( + destination_name=destination_name, + dataset_name="test_full_load", + include_support=True, + ) _check_pipeline_has_tables(pipeline=pipeline, tables=SUPPORT_TABLES) + counts = load_table_counts(pipeline, *SUPPORT_TABLES) + assert counts == { + "ticket_forms": 2, + "ticket_fields": 12, + "users": 3, + "views": 8, + "custom_agent_roles": 7, + "organization_memberships": 1, + "tickets": 5, + "macros": 2, + "brands": 1, + "tags": 3, + "ticket_metrics": 5, + "triggers": 7, + "ticket_events": 10, + "organizations": 1, + "ticket_metric_events": 65, + "automations": 3, + "recipient_addresses": 1, + "group_memberships": 1, + "groups": 1, + } @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @@ -256,7 +289,6 @@ def test_zendesk_chat(destination_name: str) -> None: @:param: destination_name - 
redshift/bigquery/postgres """ - # FULL PIPELINE RUN pipeline = _create_pipeline( destination_name=destination_name, dataset_name="test_full_load",