Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Datetime Logger #1

Merged
merged 4 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ from snsynth.transform import (
OneHotEncoder,
StandardScaler,
)
from snsynth.transform.datetime import DateTimeTransformer


constraints = {
"id": AnonymizationTransformer("uuid4"),
Expand All @@ -55,6 +57,7 @@ constraints = {
),
"rank": LabelTransformer(nullable=False),
"job": DropTransformer(),
"date": DateTimeTransformer(epoch="1993-06-04"),
}
```

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
setup(
name="smartnoise_synth_logger",
packages=find_packages(),
version="0.0.3",
version="0.0.4",
description="A logger wrapper for Smartnoise Synth Table Transformer",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
2 changes: 2 additions & 0 deletions smartnoise_synth_logger/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

SSYNTH = "smartnoise-synth"
SSYNTH_TRANSFORMER = "_ssynth_transformer:"
SSYNTH_DATETIME = "_ssynth_datetime_transformer:"

ANON_PARAM = "fake"

Expand All @@ -21,3 +22,4 @@ class Transformers(StrEnum):

CHAIN = "ChainTransformer"
ANONIMIZATION = "AnonymizationTransformer"
DATETIME = "DateTimeTransformer"
15 changes: 12 additions & 3 deletions smartnoise_synth_logger/deserialise.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from smartnoise_synth_logger.constants import (
JsonBodyKey,
SSYNTH,
SSYNTH_DATETIME,
SSYNTH_TRANSFORMER,
)

Expand Down Expand Up @@ -33,11 +34,19 @@ def object_hook(self, dct: dict) -> dict:
"""
for k, v in dct.items():
if isinstance(v, str):
nb_letters = len(SSYNTH_TRANSFORMER)
if v[:nb_letters] == SSYNTH_TRANSFORMER:
if v[: len(SSYNTH_TRANSFORMER)] == SSYNTH_TRANSFORMER:
try:
dct[k] = getattr(
snsynth.transform, v[nb_letters:] # noqa E203
snsynth.transform,
v[len(SSYNTH_TRANSFORMER) :], # noqa E203
)
except Exception as e:
raise ValueError(e) from e
elif v[: len(SSYNTH_DATETIME)] == SSYNTH_DATETIME:
try:
dct[k] = getattr(
snsynth.transform.datetime,
v[len(SSYNTH_DATETIME) :], # noqa E203
)
except Exception as e:
raise ValueError(e) from e
Expand Down
68 changes: 47 additions & 21 deletions smartnoise_synth_logger/serialise.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
ANON_PARAM,
JsonBodyKey,
SSYNTH,
SSYNTH_DATETIME,
SSYNTH_TRANSFORMER,
Transformers,
)
Expand All @@ -19,16 +20,49 @@ def get_filtered_params(obj) -> dict:

def handle_chain_transformer(col_constraints: dict) -> dict:
"""Handle ChainTransformer-specific logic."""
transformers = col_constraints.transformers
transformers_list = []
for t in col_constraints.transformers:
operator_name = t.__class__.__name__

if operator_name == Transformers.DATETIME:
transformer_dict = handle_datetime_transformer(t)
else:
transformer_dict = handle_default_transformer(t)

transformers_list.append(transformer_dict)

return {
JsonBodyKey.TYPE: SSYNTH_TRANSFORMER + Transformers.CHAIN,
JsonBodyKey.PARAM: [
{
JsonBodyKey.TYPE: SSYNTH_TRANSFORMER + t.__class__.__name__,
JsonBodyKey.PARAM: get_filtered_params(t),
}
for t in transformers
],
JsonBodyKey.PARAM: transformers_list,
}


def handle_datetime_transformer(col_constraints: dict) -> dict:
    """Build the serialisable description of a DateTimeTransformer.

    Args:
        col_constraints: The transformer to describe. NOTE(review): the
            annotation says ``dict``, but the body only reads
            ``__class__`` from it and hands it to ``get_filtered_params``,
            so callers appear to pass a transformer instance - confirm.

    Returns:
        A two-entry dict: the type tag (``SSYNTH_DATETIME`` prefix followed
        by the class name) and the filtered parameters, in that order.
    """
    params = get_filtered_params(col_constraints)
    # Swap the epoch object for the string produced by its isoformat()
    # method, so the params carry text rather than the object itself.
    params["epoch"] = params["epoch"].isoformat()
    type_tag = SSYNTH_DATETIME + col_constraints.__class__.__name__
    return {JsonBodyKey.TYPE: type_tag, JsonBodyKey.PARAM: params}


def handle_anon_transformer(col_constraints: dict) -> dict:
    """Build the serialisable description of an AnonymizationTransformer.

    Only the name of the transformer's ``fake`` callable is recorded as a
    parameter, under the ``ANON_PARAM`` key.

    Args:
        col_constraints: The transformer to describe; it must expose a
            ``fake`` attribute that has a ``__name__``.

    Returns:
        A two-entry dict: the type tag (``SSYNTH_TRANSFORMER`` prefix
        followed by the class name) and the single-entry params dict.
    """
    type_tag = SSYNTH_TRANSFORMER + col_constraints.__class__.__name__
    fake_name = col_constraints.fake.__name__
    return {
        JsonBodyKey.TYPE: type_tag,
        JsonBodyKey.PARAM: {ANON_PARAM: fake_name},
    }


def handle_default_transformer(col_constraints: dict) -> dict:
    """Build the generic serialisable description of a transformer.

    Used for transformers that need no special treatment.

    Args:
        col_constraints: The transformer to describe.

    Returns:
        A two-entry dict: the type tag (``SSYNTH_TRANSFORMER`` prefix
        followed by the class name) and whatever ``get_filtered_params``
        returns for the transformer.
    """
    type_tag = SSYNTH_TRANSFORMER + col_constraints.__class__.__name__
    params = get_filtered_params(col_constraints)
    return {JsonBodyKey.TYPE: type_tag, JsonBodyKey.PARAM: params}


Expand Down Expand Up @@ -62,19 +96,11 @@ def serialise_constraints(constraints: dict) -> str:
if operator_name == Transformers.CHAIN:
transformer_dict = handle_chain_transformer(col_constraints)
elif operator_name == Transformers.ANONIMIZATION:
transformer_dict = {
JsonBodyKey.TYPE: SSYNTH_TRANSFORMER
+ Transformers.ANONIMIZATION,
JsonBodyKey.PARAM: {
ANON_PARAM: col_constraints.fake.__name__
},
}
else: # default
transformer_dict = {
JsonBodyKey.TYPE: SSYNTH_TRANSFORMER
+ col_constraints.__class__.__name__,
JsonBodyKey.PARAM: get_filtered_params(col_constraints),
}
transformer_dict = handle_anon_transformer(col_constraints)
elif operator_name == Transformers.DATETIME:
transformer_dict = handle_datetime_transformer(col_constraints)
else:
transformer_dict = handle_default_transformer(col_constraints)

json_body[JsonBodyKey.CONSTRAINTS][col_name] = transformer_dict

Expand Down
48 changes: 40 additions & 8 deletions tests/test_de_serialisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
OneHotEncoder,
StandardScaler,
)
from snsynth.transform.datetime import DateTimeTransformer
from smartnoise_synth_logger import (
deserialise_constraints,
serialise_constraints,
Expand Down Expand Up @@ -40,6 +41,28 @@ def test_anon_str_serialize():
assert result_json == expected_json_updated


def test_datetime_serialize():
# No param
example_constraints = {"birthdays": DateTimeTransformer()}
result_json = serialise_constraints(example_constraints)
expected_json = """{"module": "smartnoise-synth", "version": "1.0.4", "constraints": {"birthdays": {"type": "_ssynth_datetime_transformer:DateTimeTransformer", "params": {"epoch": "1970-01-01T00:00:00"}}}}""" # noqa
expected_json_updated = expected_json.replace(
"1.0.4", pkg_resources.get_distribution(SSYNTH).version

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice! I guess we could use the same logic for the measurement pipeline in our test with OpenDP, instead of updating it each time we use a new version.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah yes ^^ let's be lazy

)
assert result_json == expected_json_updated

# Start epoch
example_constraints = {
"birthdays": DateTimeTransformer(epoch="1900-01-21")
}
result_json = serialise_constraints(example_constraints)
expected_json = """{"module": "smartnoise-synth", "version": "1.0.4", "constraints": {"birthdays": {"type": "_ssynth_datetime_transformer:DateTimeTransformer", "params": {"epoch": "1900-01-21T00:00:00"}}}}""" # noqa
expected_json_updated = expected_json.replace(
"1.0.4", pkg_resources.get_distribution(SSYNTH).version
)
assert result_json == expected_json_updated


def test_chain_serialize():
example_constraints = {
"income": ChainTransformer(
Expand Down Expand Up @@ -82,13 +105,10 @@ def test_serialize():
),
"rank": LabelTransformer(nullable=False),
"job": DropTransformer(),
# "date": ChainTransformer(
# [DateTimeTransformer(), MinMaxTransformer(nullable=False)]
# ),
"date": DateTimeTransformer(epoch="1993-06-04"),
}
result_json = serialise_constraints(example_constraints)

expected_json = """{"module": "smartnoise-synth", "version": "1.0.4", "constraints": {"id": {"type": "_ssynth_transformer:AnonymizationTransformer", "params": {"fake": "email"}}, "income": {"type": "_ssynth_transformer:ChainTransformer", "params": [{"type": "_ssynth_transformer:LogTransformer", "params": {}}, {"type": "_ssynth_transformer:BinTransformer", "params": {"lower": 0, "upper": 50, "epsilon": 0.0, "bins": 20, "nullable": false, "odometer": null}}]}, "height": {"type": "_ssynth_transformer:ChainTransformer", "params": [{"type": "_ssynth_transformer:StandardScaler", "params": {"lower": 0, "upper": 1, "epsilon": 0.0, "nullable": false, "odometer": null}}, {"type": "_ssynth_transformer:BinTransformer", "params": {"lower": 0, "upper": 1, "epsilon": 0.0, "bins": 20, "nullable": false, "odometer": null}}]}, "weight": {"type": "_ssynth_transformer:ChainTransformer", "params": [{"type": "_ssynth_transformer:ClampTransformer", "params": {"upper": 200, "lower": 10}}, {"type": "_ssynth_transformer:BinTransformer", "params": {"lower": null, "upper": null, "epsilon": 0.0, "bins": 20, "nullable": false, "odometer": null}}]}, "age": {"type": "_ssynth_transformer:MinMaxTransformer", "params": {"lower": 0, "upper": 100, "epsilon": 0.0, "negative": true, "nullable": false, "odometer": null}}, "sex": {"type": "_ssynth_transformer:ChainTransformer", "params": [{"type": "_ssynth_transformer:LabelTransformer", "params": {"nullable": true}}, {"type": "_ssynth_transformer:OneHotEncoder", "params": {}}]}, "rank": {"type": "_ssynth_transformer:LabelTransformer", "params": {"nullable": false}}, "job": {"type": "_ssynth_transformer:DropTransformer", "params": {}}}}""" # noqa
expected_json = """{"module": "smartnoise-synth", "version": "1.0.4", "constraints": {"id": {"type": "_ssynth_transformer:AnonymizationTransformer", "params": {"fake": "email"}}, "income": {"type": "_ssynth_transformer:ChainTransformer", "params": [{"type": "_ssynth_transformer:LogTransformer", "params": {}}, {"type": "_ssynth_transformer:BinTransformer", "params": {"lower": 0, "upper": 50, "epsilon": 0.0, "bins": 20, "nullable": false, "odometer": null}}]}, "height": {"type": "_ssynth_transformer:ChainTransformer", "params": [{"type": "_ssynth_transformer:StandardScaler", "params": {"lower": 0, "upper": 1, "epsilon": 0.0, "nullable": false, "odometer": null}}, {"type": "_ssynth_transformer:BinTransformer", "params": {"lower": 0, "upper": 1, "epsilon": 0.0, "bins": 20, "nullable": false, "odometer": null}}]}, "weight": {"type": "_ssynth_transformer:ChainTransformer", "params": [{"type": "_ssynth_transformer:ClampTransformer", "params": {"upper": 200, "lower": 10}}, {"type": "_ssynth_transformer:BinTransformer", "params": {"lower": null, "upper": null, "epsilon": 0.0, "bins": 20, "nullable": false, "odometer": null}}]}, "age": {"type": "_ssynth_transformer:MinMaxTransformer", "params": {"lower": 0, "upper": 100, "epsilon": 0.0, "negative": true, "nullable": false, "odometer": null}}, "sex": {"type": "_ssynth_transformer:ChainTransformer", "params": [{"type": "_ssynth_transformer:LabelTransformer", "params": {"nullable": true}}, {"type": "_ssynth_transformer:OneHotEncoder", "params": {}}]}, "rank": {"type": "_ssynth_transformer:LabelTransformer", "params": {"nullable": false}}, "job": {"type": "_ssynth_transformer:DropTransformer", "params": {}}, "date": {"type": "_ssynth_datetime_transformer:DateTimeTransformer", "params": {"epoch": "1993-06-04T00:00:00"}}}}""" # noqa
expected_json_updated = expected_json.replace(
"1.0.4", pkg_resources.get_distribution(SSYNTH).version
)
Expand All @@ -109,6 +129,20 @@ def test_anon_serialize_deserialise():
assert e_v.__class__.__name__ == de_v.__class__.__name__


def test_datetime_serialize_deserialise():
    """Round-trip a DateTimeTransformer through serialise/deserialise.

    Checks that the column names survive and that each deserialised value
    has the same class name as the original transformer.
    """
    example_constraints = {
        "birthdays": DateTimeTransformer(epoch="1900-01-21")
    }
    serialised = serialise_constraints(example_constraints)
    deserialised = deserialise_constraints(serialised)

    # zip() stops at the shorter input, so without this check an empty or
    # truncated result would make the loop below pass vacuously.
    assert len(deserialised) == len(example_constraints)

    for (e_k, e_v), (de_k, de_v) in zip(
        example_constraints.items(), deserialised.items()
    ):
        assert e_k == de_k
        assert e_v.__class__.__name__ == de_v.__class__.__name__


def test_serialize_deserialise():
example_constraints = {
"id": AnonymizationTransformer("email"),
Expand All @@ -133,9 +167,7 @@ def test_serialize_deserialise():
),
"rank": LabelTransformer(nullable=False),
"job": DropTransformer(),
# "date": ChainTransformer(
# [DateTimeTransformer(), MinMaxTransformer(nullable=False)]
# ),
"date": DateTimeTransformer(epoch="1993-06-04"),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I fully understand the epoch. From what I understand, it's the reference date used during the transformation from date to float (number of days since the reference, with 1970-01-01 being 0 by default). Is it only testing this aspect?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here it is testing that the serialiser also works when given a different reference start date, so from what I understand, in this case 0 would correspond to "1993-06-04".
But I am only testing that the de/serialiser works, not the underlying library.

}
serialised = serialise_constraints(example_constraints)
deserialised = deserialise_constraints(serialised)
Expand Down
Loading