fix: properly implement and test explain (#246)
eakmanrq authored Jan 17, 2025
1 parent b7e15bc commit cf6d67f
Showing 10 changed files with 85 additions and 5 deletions.
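In short: explain() on BaseDataFrame now delegates to a shared _get_explain_plan_rows() helper, and engines customize behavior either by setting _EXPLAIN_PREFIX (Snowflake) or by overriding explain() entirely (DuckDB prints a different result column; BigQuery raises NotImplementedError). A minimal usage sketch, assuming a DuckDB-backed session as described in the sqlframe README; the sample data below is illustrative and not part of this commit:

from sqlframe.duckdb import DuckDBSession

session = DuckDBSession()  # in-memory DuckDB by default
df = session.createDataFrame(
    [(1, "Jack", "Shephard"), (2, "Kate", "Austen")],
    ["employee_id", "fname", "lname"],
)
df.explain()  # prints the engine's EXPLAIN plan text to stdout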
20 changes: 15 additions & 5 deletions sqlframe/base/dataframe.py
@@ -202,6 +202,7 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
_na: t.Type[NA]
_stat: t.Type[STAT]
_group_data: t.Type[GROUP_DATA]
_EXPLAIN_PREFIX = "EXPLAIN"

def __init__(
self,
@@ -1144,6 +1145,18 @@ def dropna(
final_df = filtered_df.select(*all_columns)
return final_df

def _get_explain_plan_rows(self) -> t.List[Row]:
sql_queries = self.sql(
pretty=False, optimize=False, as_list=True, dialect=self.session.execution_dialect
)
if len(sql_queries) > 1:
raise ValueError("Cannot explain a DataFrame with multiple queries")
sql_query = " ".join([self._EXPLAIN_PREFIX, sql_queries[0]])
results = self.session._collect(sql_query)
if len(results) != 1:
    raise ValueError(f"Expected exactly one row from the explain query, got {len(results)}")
return results

def explain(
self, extended: t.Optional[t.Union[bool, str]] = None, mode: t.Optional[str] = None
) -> None:
@@ -1212,11 +1225,8 @@ def explain(
...Statistics...
...
"""
sql_queries = self.sql(pretty=False, optimize=False, as_list=True)
if len(sql_queries) > 1:
raise ValueError("Cannot explain a DataFrame with multiple queries")
sql_query = "EXPLAIN " + sql_queries[0]
self.session._execute(sql_query)
results = self._get_explain_plan_rows()
print(results[0][0])

@operation(Operation.FROM)
def fillna(
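The engine-specific overrides below all follow the pattern this hunk sets up: set _EXPLAIN_PREFIX to change the EXPLAIN keyword, or reimplement explain() around the shared helper. A hedged sketch of that pattern (MyEngineDataFrame and its EXPLAIN VERBOSE prefix are hypothetical, not part of this commit):

class MyEngineDataFrame(BaseDataFrame):
    _EXPLAIN_PREFIX = "EXPLAIN VERBOSE"  # hypothetical engine-specific keyword

    def explain(
        self, extended: t.Optional[t.Union[bool, str]] = None, mode: t.Optional[str] = None
    ) -> None:
        results = self._get_explain_plan_rows()
        print(results[0][0])  # print whichever column holds this engine's plan text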
5 changes: 5 additions & 0 deletions sqlframe/bigquery/dataframe.py
@@ -72,3 +72,8 @@ def field_to_column(field: bigquery.SchemaField) -> CatalogColumn:
sql = self.session._to_sql(self.expression)
query_job = self.session._client.query(sql, job_config=job_config)
return [field_to_column(field) for field in query_job.schema]

def explain(
self, extended: t.Optional[t.Union[bool, str]] = None, mode: t.Optional[str] = None
) -> None:
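        # BigQuery exposes query plans through query job statistics rather than
        # an EXPLAIN statement, so there is no plan text to fetch and print.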
raise NotImplementedError("BigQuery does not support EXPLAIN")
6 changes: 6 additions & 0 deletions sqlframe/duckdb/dataframe.py
@@ -46,6 +46,12 @@ class DuckDBDataFrame(
_stat = DuckDBDataFrameStatFunctions
_group_data = DuckDBGroupedData

def explain(
self, extended: t.Optional[t.Union[bool, str]] = None, mode: t.Optional[str] = None
) -> None:
results = self._get_explain_plan_rows()
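        # DuckDB returns EXPLAIN output as (explain_key, explain_value) rows,
        # so the plan text lives in column index 1 rather than 0.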
print(results[0][1])

@t.overload
def toArrow(self) -> ArrowTable: ...

1 change: 1 addition & 0 deletions sqlframe/snowflake/dataframe.py
@@ -43,6 +43,7 @@ class SnowflakeDataFrame(
_na = SnowflakeDataFrameNaFunctions
_stat = SnowflakeDataFrameStatFunctions
_group_data = SnowflakeGroupedData
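    # Snowflake's EXPLAIN defaults to tabular output; USING TEXT returns the
    # plan as a single text block that explain() can print directly.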
_EXPLAIN_PREFIX = "EXPLAIN USING TEXT"

@property
def _typed_columns(self) -> t.List[CatalogColumn]:
5 changes: 5 additions & 0 deletions tests/integration/engines/bigquery/test_bigquery_dataframe.py
@@ -157,3 +157,8 @@ def test_schema_nested(bigquery_datatypes: BigQueryDataFrame):
assert struct_fields[8].dataType == types.TimestampType()
assert struct_fields[9].name == "boolean_col"
assert struct_fields[9].dataType == types.BooleanType()


def test_explain(bigquery_employee: BigQueryDataFrame):
with pytest.raises(NotImplementedError):
bigquery_employee.explain()
8 changes: 8 additions & 0 deletions tests/integration/engines/databricks/test_databricks_dataframe.py
@@ -167,3 +167,11 @@ def test_schema_nested(databricks_datatypes: DatabricksDataFrame):
assert struct_fields[9].dataType == types.TimestampType()
assert struct_fields[10].name == "boolean_col"
assert struct_fields[10].dataType == types.BooleanType()


def test_explain(databricks_employee: DatabricksDataFrame, capsys):
databricks_employee.explain()
output = capsys.readouterr().out.strip()
assert "== Physical Plan ==" in output
assert "LocalTableScan" in output
assert "== Photon Explanation ==" in output
14 changes: 14 additions & 0 deletions tests/integration/engines/duck/test_duckdb_dataframe.py
@@ -252,3 +252,17 @@ def test_to_arrow_batch(duckdb_employee: DuckDBDataFrame):
assert fifth_batch.column(4).to_pylist() == [100]
with pytest.raises(StopIteration):
record_batch_reader.read_next_batch()


def test_explain(duckdb_employee: DuckDBDataFrame, capsys):
duckdb_employee.explain()
assert (
capsys.readouterr().out.strip()
== """
┌───────────────────────────┐
│ COLUMN_DATA_SCAN │
│ ──────────────────── │
│ ~5 Rows │
└───────────────────────────┘
""".strip()
)
8 changes: 8 additions & 0 deletions tests/integration/engines/postgres/test_postgres_dataframe.py
@@ -120,3 +120,11 @@ def test_schema_nested(postgres_datatypes: PostgresDataFrame):
assert struct_fields[6].dataType == types.TimestampType()
assert struct_fields[7].name == "boolean_col"
assert struct_fields[7].dataType == types.BooleanType()


def test_explain(postgres_employee: PostgresDataFrame, capsys):
postgres_employee.explain()
assert (
capsys.readouterr().out.strip()
== """Values Scan on "*VALUES*" (cost=0.00..0.06 rows=5 width=76)""".strip()
)
16 changes: 16 additions & 0 deletions tests/integration/engines/snowflake/test_snowflake_dataframe.py
@@ -156,3 +156,19 @@ def test_schema_nested(snowflake_datatypes: SnowflakeDataFrame):
assert struct_fields[9].dataType == types.TimestampType()
assert struct_fields[10].name == "boolean_col"
assert struct_fields[10].dataType == types.BooleanType()


def test_explain(snowflake_employee: SnowflakeDataFrame, capsys):
snowflake_employee.explain()
assert (
capsys.readouterr().out.strip()
== """
GlobalStats:
partitionsTotal=0
partitionsAssigned=0
bytesAssigned=0
Operations:
1:0 ->Result A1.EMPLOYEE_ID, A1.FNAME, A1.LNAME, A1.AGE, A1.STORE_ID
1:1 ->ValuesClause (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100)
""".strip()
)
7 changes: 7 additions & 0 deletions tests/integration/engines/spark/test_spark_dataframe.py
@@ -163,3 +163,10 @@ def test_schema_nested(spark_datatypes: SparkDataFrame):
assert struct_fields[9].dataType == types.TimestampType()
assert struct_fields[10].name == "boolean_col"
assert struct_fields[10].dataType == types.BooleanType()


def test_explain(spark_employee: SparkDataFrame, capsys):
spark_employee.explain()
output = capsys.readouterr().out.strip()
assert "== Physical Plan ==" in output
assert "LocalTableScan" in output
