From fa14af51fcae38eafeb35f697fb500be8c6db7ed Mon Sep 17 00:00:00 2001 From: Ryan Eakman <6326532+eakmanrq@users.noreply.github.com> Date: Tue, 21 May 2024 21:12:52 -0700 Subject: [PATCH] fix: expand hash length to avoid collision (#17) --- sqlframe/base/dataframe.py | 2 +- tests/unit/standalone/test_dataframe.py | 20 +++++++++---------- .../unit/standalone/test_dataframe_writer.py | 12 +++++------ tests/unit/standalone/test_session.py | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/sqlframe/base/dataframe.py b/sqlframe/base/dataframe.py index d5764dc..aa235bc 100644 --- a/sqlframe/base/dataframe.py +++ b/sqlframe/base/dataframe.py @@ -417,7 +417,7 @@ def _create_hash_from_expression(self, expression: exp.Expression) -> str: from sqlframe.base.session import _BaseSession value = expression.sql(dialect=_BaseSession().input_dialect).encode("utf-8") - hash = f"t{zlib.crc32(value)}"[:6] + hash = f"t{zlib.crc32(value)}"[:9] return self.session._normalize_string(hash) def _get_select_expressions( diff --git a/tests/unit/standalone/test_dataframe.py b/tests/unit/standalone/test_dataframe.py index 7291c89..a382caf 100644 --- a/tests/unit/standalone/test_dataframe.py +++ b/tests/unit/standalone/test_dataframe.py @@ -10,7 +10,7 @@ def test_hash_select_expression(standalone_employee: StandaloneDataFrame): expression = exp.select("cola").from_("table") - assert standalone_employee._create_hash_from_expression(expression) == "t17051" + assert standalone_employee._create_hash_from_expression(expression) == "t17051938" def test_columns(standalone_employee: StandaloneDataFrame): @@ -20,9 +20,9 @@ def test_columns(standalone_employee: StandaloneDataFrame): def test_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable): df = standalone_employee.select("fname").cache() expected_statements = [ - "DROP VIEW IF EXISTS t31563", - "CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", - "SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`", + "DROP VIEW IF EXISTS t31563989", + "CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", + "SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`", ] compare_sql(df, expected_statements) @@ -30,9 +30,9 @@ def test_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable def test_persist_default(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable): df = standalone_employee.select("fname").persist() expected_statements = [ - "DROP VIEW IF EXISTS t31563", - "CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'MEMORY_AND_DISK_SER') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", - "SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`", + "DROP VIEW IF EXISTS t31563989", + "CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'MEMORY_AND_DISK_SER') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", + "SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`", ] compare_sql(df, expected_statements) @@ -40,9 +40,9 @@ def test_persist_default(standalone_employee: StandaloneDataFrame, compare_sql: def test_persist_storagelevel(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable): df = standalone_employee.select("fname").persist("DISK_ONLY_2") expected_statements = [ - "DROP VIEW IF EXISTS t31563", - "CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'DISK_ONLY_2') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", - "SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`", + "DROP VIEW IF EXISTS t31563989", + "CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'DISK_ONLY_2') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", + "SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`", ] compare_sql(df, expected_statements) diff --git a/tests/unit/standalone/test_dataframe_writer.py b/tests/unit/standalone/test_dataframe_writer.py index bca5c0b..9194be4 100644 --- a/tests/unit/standalone/test_dataframe_writer.py +++ b/tests/unit/standalone/test_dataframe_writer.py @@ -43,9 +43,9 @@ def test_insertInto_byName(standalone_employee: StandaloneDataFrame, compare_sql def test_insertInto_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable): df = standalone_employee.cache().write.insertInto("table_name") expected_statements = [ - "DROP VIEW IF EXISTS t12441", - "CACHE LAZY TABLE t12441 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", - "INSERT INTO table_name SELECT `t12441`.`employee_id` AS `employee_id`, `t12441`.`fname` AS `fname`, `t12441`.`lname` AS `lname`, `t12441`.`age` AS `age`, `t12441`.`store_id` AS `store_id` FROM `t12441` AS `t12441`", + "DROP VIEW IF EXISTS t12441709", + "CACHE LAZY TABLE t12441709 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", + "INSERT INTO table_name SELECT `t12441709`.`employee_id` AS `employee_id`, `t12441709`.`fname` AS `fname`, `t12441709`.`lname` AS `lname`, `t12441709`.`age` AS `age`, `t12441709`.`store_id` AS `store_id` FROM `t12441709` AS `t12441709`", ] compare_sql(df, expected_statements) @@ -94,9 +94,9 @@ def test_mode_override(standalone_employee: StandaloneDataFrame, compare_sql: t. def test_saveAsTable_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable): df = standalone_employee.cache().write.saveAsTable("table_name") expected_statements = [ - "DROP VIEW IF EXISTS t12441", - "CACHE LAZY TABLE t12441 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", - "CREATE TABLE table_name AS SELECT `t12441`.`employee_id` AS `employee_id`, `t12441`.`fname` AS `fname`, `t12441`.`lname` AS `lname`, `t12441`.`age` AS `age`, `t12441`.`store_id` AS `store_id` FROM `t12441` AS `t12441`", + "DROP VIEW IF EXISTS t12441709", + "CACHE LAZY TABLE t12441709 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)", + "CREATE TABLE table_name AS SELECT `t12441709`.`employee_id` AS `employee_id`, `t12441709`.`fname` AS `fname`, `t12441709`.`lname` AS `lname`, `t12441709`.`age` AS `age`, `t12441709`.`store_id` AS `store_id` FROM `t12441709` AS `t12441709`", ] compare_sql(df, expected_statements) diff --git a/tests/unit/standalone/test_session.py b/tests/unit/standalone/test_session.py index 52d0048..dca0f31 100644 --- a/tests/unit/standalone/test_session.py +++ b/tests/unit/standalone/test_session.py @@ -110,7 +110,7 @@ def test_sql_with_aggs(standalone_session: StandaloneSession, compare_sql: t.Cal df = standalone_session.sql(query).groupBy(F.col("cola")).agg(F.sum("colb")) compare_sql( df, - "WITH t26614 AS (SELECT `table`.`cola` AS `cola`, `table`.`colb` AS `colb` FROM `table` AS `table`), t23454 AS (SELECT cola, colb FROM t26614) SELECT cola, SUM(colb) FROM t23454 GROUP BY cola", + "WITH t26614157 AS (SELECT `table`.`cola` AS `cola`, `table`.`colb` AS `colb` FROM `table` AS `table`), t38889420 AS (SELECT cola, colb FROM t26614157) SELECT cola, SUM(colb) FROM t38889420 GROUP BY cola", pretty=False, optimize=False, )