From e7220f4354ec11a68ca0b14f73dee6b030720c2b Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 23 Aug 2024 12:00:41 -0500 Subject: [PATCH 1/8] Remove outdated test YAML file --- tests/data/test-merge.yml | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 tests/data/test-merge.yml diff --git a/tests/data/test-merge.yml b/tests/data/test-merge.yml deleted file mode 100644 index 9fac5599..00000000 --- a/tests/data/test-merge.yml +++ /dev/null @@ -1,12 +0,0 @@ ---- -"@graph": - - "@id": "#sdqa_Threshold.createdDate" - length: 6 - - "@id": "#sdqa_Threshold" - indexes: - - name: IDX_sdqaThreshold_metricId - "@id": "#IDX_sdqaThreshold_metricId" - columns: ["#sdqa_Threshold.sdqa_metricId"] - - name: IDX_sdqaThreshold_createdDate - "@id": "#IDX_sdqaThreshold_createdDate" - columns: ["#sdqa_Threshold.createdDate"] From c188af37a37b7fcad6d2d6ed14771ad98a2db67f Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 15 Aug 2024 15:51:41 -0500 Subject: [PATCH 2/8] Add Schema validator that automatically generates object IDs Implement a pre-validator which creates object IDs where they are missing on Schema objects. The values of the generated IDs for schema, table and column objects are constructed based on the common usage patterns from SDM Schemas. Constraints and indexes use the name field as the ID. The `--id-generation` flag must be used from the command-line to enable this feature, as it is off by default. --- python/felis/datamodel.py | 54 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/python/felis/datamodel.py b/python/felis/datamodel.py index a83309c9..b97dd10f 100644 --- a/python/felis/datamodel.py +++ b/python/felis/datamodel.py @@ -723,6 +723,55 @@ class Schema(BaseObject): id_map: dict[str, Any] = Field(default_factory=dict, exclude=True) """Map of IDs to objects.""" + @model_validator(mode="before") + @classmethod + def generate_ids(cls, values: dict[str, Any], info: ValidationInfo) -> dict[str, Any]: + """Generate IDs for objects that do not have them. + + Parameters + ---------- + values + The values of the schema. + info + Validation context used to determine if ID generation is enabled. + + Returns + ------- + `dict` [ `str`, `Any` ] + The values of the schema with generated IDs. + """ + context = info.context + if not context or not context.get("id_generation", False): + logger.debug("Skipping ID generation") + return values + schema_name = values["name"] + if "@id" not in values: + values["@id"] = f"#{schema_name}" + logger.debug(f"Generated ID '{values['@id']}' for schema '{schema_name}'") + if "tables" in values: + for table in values["tables"]: + if "@id" not in table: + table["@id"] = f"#{table['name']}" + logger.debug(f"Generated ID '{table['@id']}' for table '{table['name']}'") + if "columns" in table: + for column in table["columns"]: + if "@id" not in column: + column["@id"] = f"#{table['name']}.{column['name']}" + logger.debug(f"Generated ID '{column['@id']}' for column '{column['name']}'") + if "constraints" in table: + for constraint in table["constraints"]: + if "@id" not in constraint: + constraint["@id"] = f"#{constraint['name']}" + logger.debug( + f"Generated ID '{constraint['@id']}' for constraint '{constraint['name']}'" + ) + if "indexes" in table: + for index in table["indexes"]: + if "@id" not in index: + index["@id"] = f"#{index['name']}" + logger.debug(f"Generated ID '{index['@id']}' for index '{index['name']}'") + return values + @field_validator("tables", mode="after") @classmethod def check_unique_table_names(cls, tables: list[Table]) -> list[Table]: @@ -777,6 +826,11 @@ def check_tap_table_indexes(self, info: ValidationInfo) -> Schema: def check_unique_constraint_names(self: Schema) -> Schema: """Check for duplicate constraint names in the schema. + Returns + ------- + `Schema` + The schema being validated. + Raises ------ ValueError From 50fbf75de6e873b8faf05421ff839870799e2e18 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 23 Aug 2024 12:58:40 -0500 Subject: [PATCH 3/8] Add support for ID generation to the CLI Add a top-level flag for enabling ID generation and pass it to the other commands via the Click context. --- python/felis/cli.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/python/felis/cli.py b/python/felis/cli.py index 4e193956..36727ebe 100644 --- a/python/felis/cli.py +++ b/python/felis/cli.py @@ -62,8 +62,16 @@ envvar="FELIS_LOGFILE", help="Felis log file path", ) -def cli(log_level: str, log_file: str | None) -> None: +@click.option( + "--id-generation", is_flag=True, help="Generate IDs for all objects that do not have them", default=False +) +@click.pass_context +def cli(ctx: click.Context, log_level: str, log_file: str | None, id_generation: bool) -> None: """Felis command line tools""" + ctx.ensure_object(dict) + ctx.obj["id_generation"] = id_generation + if ctx.obj["id_generation"]: + logger.info("ID generation is enabled") if log_file: logging.basicConfig(filename=log_file, level=log_level) else: @@ -88,7 +96,9 @@ def cli(log_level: str, log_file: str | None) -> None: ) @click.option("--ignore-constraints", is_flag=True, help="Ignore constraints when creating tables") @click.argument("file", type=click.File()) +@click.pass_context def create( + ctx: click.Context, engine_url: str, schema_name: str | None, initialize: bool, @@ -124,7 +134,7 @@ def create( """ try: yaml_data = yaml.safe_load(file) - schema = Schema.model_validate(yaml_data) + schema = Schema.model_validate(yaml_data, context={"id_generation": ctx.obj["id_generation"]}) url = make_url(engine_url) if schema_name: logger.info(f"Overriding schema name with: {schema_name}") @@ -355,7 +365,9 @@ def load_tap( default=False, ) @click.argument("files", nargs=-1, type=click.File()) +@click.pass_context def validate( + ctx: click.Context, check_description: bool, check_redundant_datatypes: bool, check_tap_table_indexes: bool, @@ -402,6 +414,7 @@ def validate( "check_redundant_datatypes": check_redundant_datatypes, "check_tap_table_indexes": check_tap_table_indexes, "check_tap_principal": check_tap_principal, + "id_generation": ctx.obj["id_generation"], }, ) except ValidationError as e: From 24e7ce5cd1c0ab360cf9dec918fb1831ddcd6ded Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 23 Aug 2024 13:00:12 -0500 Subject: [PATCH 4/8] Add schema for testing ID generation --- tests/data/test_id_generation.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/data/test_id_generation.yaml diff --git a/tests/data/test_id_generation.yaml b/tests/data/test_id_generation.yaml new file mode 100644 index 00000000..6d559bd5 --- /dev/null +++ b/tests/data/test_id_generation.yaml @@ -0,0 +1,23 @@ +name: test_id_generation +description: Test schema for id generation. +tables: +- name: test_table + primaryKey: "#test_table.test_column1" + mysql:engine: MyISAM + columns: + - name: test_column1 + datatype: int + description: Test column. + - name: test_column2 + datatype: string + description: Test column. + length: 30 + indexes: + - name: test_index + columns: + - test_column1 + constraints: + - name: test_constraint + "@type": Unique + columns: + - test_column2 From 6e1d33c447e28d5f72d4631f9fe511d26b556e2b Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 23 Aug 2024 13:00:23 -0500 Subject: [PATCH 5/8] Add test of ID generation Load a schema without IDs and set them, and load the same schema with ID generation turned off to check that an exception is thrown. --- tests/test_datamodel.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_datamodel.py b/tests/test_datamodel.py index 4db5369b..4ab2a6cf 100644 --- a/tests/test_datamodel.py +++ b/tests/test_datamodel.py @@ -484,6 +484,19 @@ def test_model_validate(self) -> None: data = yaml.safe_load(test_yaml) Schema.model_validate(data) + def test_id_generation(self) -> None: + """Test ID generation.""" + test_path = os.path.join(TESTDIR, "data", "test_id_generation.yaml") + with open(test_path) as test_yaml: + yaml_data = yaml.safe_load(test_yaml) + # Generate IDs for objects in the test schema. + Schema.model_validate(yaml_data, context={"id_generation": True}) + with open(test_path) as test_yaml: + yaml_data = yaml.safe_load(test_yaml) + # Test that an error is raised when id generation is disabled. + with self.assertRaises(ValidationError): + Schema.model_validate(yaml_data, context={"id_generation": False}) + class SchemaVersionTest(unittest.TestCase): """Test the schema version.""" From 4920eebdbbbeea0cb27c9915759cdacc747d27e3 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 23 Aug 2024 13:14:30 -0500 Subject: [PATCH 6/8] Add tests for using the ID generation flag from the command line --- tests/test_cli.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index d9a4bbcc..16077e58 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -31,7 +31,6 @@ TESTDIR = os.path.abspath(os.path.dirname(__file__)) TEST_YAML = os.path.join(TESTDIR, "data", "test.yml") -TEST_MERGE_YAML = os.path.join(TESTDIR, "data", "test-merge.yml") class CliTestCase(unittest.TestCase): @@ -126,6 +125,22 @@ def test_validate_default(self) -> None: result = runner.invoke(cli, ["validate", TEST_YAML], catch_exceptions=False) self.assertEqual(result.exit_code, 0) + def test_id_generation(self) -> None: + """Test the ``--id-generation`` flag.""" + test_yaml = os.path.join(TESTDIR, "data", "test_id_generation.yaml") + runner = CliRunner() + result = runner.invoke(cli, ["--id-generation", "validate", test_yaml], catch_exceptions=False) + self.assertEqual(result.exit_code, 0) + + def test_no_id_generation(self) -> None: + """Test that loading a schema without IDs fails if ID generation is not + enabled. + """ + test_yaml = os.path.join(TESTDIR, "data", "test_id_generation.yaml") + runner = CliRunner() + result = runner.invoke(cli, ["validate", test_yaml], catch_exceptions=False) + self.assertNotEqual(result.exit_code, 0) + def test_validation_flags(self) -> None: """Test schema validation flags.""" runner = CliRunner() From 87dfdbeb9f646aba57016e67c8b9c5fa1f9b9ef7 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Mon, 26 Aug 2024 13:16:38 -0500 Subject: [PATCH 7/8] Add validator that checks if index names are unique within the schema --- python/felis/datamodel.py | 30 ++++++++++++++++++++++++++++++ tests/test_datamodel.py | 11 +++++++++++ 2 files changed, 41 insertions(+) diff --git a/python/felis/datamodel.py b/python/felis/datamodel.py index b97dd10f..8e793a3a 100644 --- a/python/felis/datamodel.py +++ b/python/felis/datamodel.py @@ -852,6 +852,36 @@ def check_unique_constraint_names(self: Schema) -> Schema: return self + @model_validator(mode="after") + def check_unique_index_names(self: Schema) -> Schema: + """Check for duplicate index names in the schema. + + Returns + ------- + `Schema` + The schema being validated. + + Raises + ------ + ValueError + Raised if duplicate index names are found in the schema. + """ + index_names = set() + duplicate_names = [] + + for table in self.tables: + for index in table.indexes: + index_name = index.name + if index_name in index_names: + duplicate_names.append(index_name) + else: + index_names.add(index_name) + + if duplicate_names: + raise ValueError(f"Duplicate index names found in schema: {duplicate_names}") + + return self + def _create_id_map(self: Schema) -> Schema: """Create a map of IDs to objects. diff --git a/tests/test_datamodel.py b/tests/test_datamodel.py index 4ab2a6cf..67681099 100644 --- a/tests/test_datamodel.py +++ b/tests/test_datamodel.py @@ -478,6 +478,17 @@ def test_check_unique_constraint_names(self) -> None: with self.assertRaises(ValidationError): Schema(name="testSchema", id="#test_id", tables=[test_tbl]) + def test_check_unique_index_names(self) -> None: + """Test that index names are unique.""" + test_col = Column(name="test_column1", id="#test_table#test_column1", datatype="int") + test_col2 = Column(name="test_column2", id="##test_table#test_column2", datatype="string", length=256) + test_tbl = Table(name="test_table", id="#test_table", columns=[test_col, test_col2]) + test_idx = Index(name="idx_test", id="#idx_test", columns=[test_col.id]) + test_idx2 = Index(name="idx_test", id="#idx_test2", columns=[test_col2.id]) + test_tbl.indexes = [test_idx, test_idx2] + with self.assertRaises(ValidationError): + Schema(name="test_schema", id="#test-schema", tables=[test_tbl]) + def test_model_validate(self) -> None: """Load a YAML test file and validate the schema data model.""" with open(TEST_YAML) as test_yaml: From 0da628cadafb08c637f80399192f11ee0f5819ad Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Mon, 26 Aug 2024 14:08:36 -0500 Subject: [PATCH 8/8] Add news fragment --- docs/changes/DM-45938.feature.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 docs/changes/DM-45938.feature.rst diff --git a/docs/changes/DM-45938.feature.rst b/docs/changes/DM-45938.feature.rst new file mode 100644 index 00000000..c63afef2 --- /dev/null +++ b/docs/changes/DM-45938.feature.rst @@ -0,0 +1,4 @@ +Added automatic ID generation for objects in Felis schemas when the `--id-generation` flag is included on the command line. +This is supported for the `create` and `validate` commands. + +Also added a Schema validator that checks if index names are unique.