From f504265943d929bcf0ba3ed133de511eb601efd0 Mon Sep 17 00:00:00 2001 From: Subhash Bhushan Date: Mon, 17 Jul 2023 13:39:19 -0700 Subject: [PATCH] AVRO-1938: Add fingerprinting support to Python implementation (#1181) * AVRO-1938 Add support for fingerprinting schemas With this change, Schema fingerprints can be extracted by invoking the `fingerprint` method on the schema object. By default, fingerprints will be generated with the CRC-64 algorithm. Optionally, the algorithm can be supplied. All algorithms supported by hashlib are available, but Avro recommends using one among CRC-64-AVRO, MD5, and SHA256 as per needs. * AVRO-1938 Fix issue with AbstractSet typecheck * Format with black * Freeze Supported Algorithms Set This commit addresses review comments and freezes the supported fingerprinting algorithms set. * Minor lint fix with black * Address Typecheck issues with Frozenset * Fold Fingerprint Mixin within Schema Addresses PR 1181 review comments. Methods within Fingerprint mixin have been made available at the module level, including static variables used in fingerprinting. This PR has been synced with latest master. 
* Add type hints to fingerprint methods/variables * Fix incorrect import sorting in schema.py to pass lint check * Address @kojiromike Jul 16 review comments * Address @kojiromike Jul 16 review comments - 2 * Address @kojiromike Jul 17 review comments * Fix black lint issue --- lang/py/avro/errors.py | 4 + lang/py/avro/schema.py | 81 +++++++++- lang/py/avro/test/test_schema.py | 244 +++++++++++++++++++++++++++++++ 3 files changed, 328 insertions(+), 1 deletion(-) diff --git a/lang/py/avro/errors.py b/lang/py/avro/errors.py index 2c7675131ef..b961a04ae43 100644 --- a/lang/py/avro/errors.py +++ b/lang/py/avro/errors.py @@ -120,3 +120,7 @@ class UsageError(RuntimeError, AvroException): class AvroRuntimeException(RuntimeError, AvroException): """Raised when compatibility parsing encounters an unknown type""" + + +class UnknownFingerprintAlgorithmException(AvroException): + """Raised when attempting to generate a fingerprint with an unknown algorithm""" diff --git a/lang/py/avro/schema.py b/lang/py/avro/schema.py index 464f8180c72..f852e146a3e 100644 --- a/lang/py/avro/schema.py +++ b/lang/py/avro/schema.py @@ -42,12 +42,23 @@ import collections import datetime import decimal +import hashlib import json import math import uuid import warnings +from functools import reduce from pathlib import Path -from typing import List, Mapping, MutableMapping, Optional, Sequence, Union, cast +from typing import ( + FrozenSet, + List, + Mapping, + MutableMapping, + Optional, + Sequence, + Union, + cast, +) import avro.constants import avro.errors @@ -104,6 +115,50 @@ def _is_timezone_aware_datetime(dt: datetime.datetime) -> bool: return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None +# Fingerprint Constants +_EMPTY64_FINGERPRINT: int = 0xC15D213AA4D7A795 +_FINGERPRINT_TABLE: tuple = tuple(reduce(lambda fp, _: (fp >> 1) ^ (_EMPTY64_FINGERPRINT & -(fp & 1)), range(8), i) for i in range(256)) + + +# All algorithms guaranteed by hashlib are supported: +# - 'blake2b', +# - 
'blake2s', +# - 'md5', +# - 'sha1', +# - 'sha224', +# - 'sha256', +# - 'sha384', +# - 'sha3_224', +# - 'sha3_256', +# - 'sha3_384', +# - 'sha3_512', +# - 'sha512', +# - 'shake_128', +# - 'shake_256' +SUPPORTED_ALGORITHMS: FrozenSet[str] = frozenset({"CRC-64-AVRO"} | hashlib.algorithms_guaranteed) + + +def _crc_64_fingerprint(data: bytes) -> bytes: + """The 64-bit Rabin Fingerprint. + + As described in the Avro specification. + + Args: + data: A bytes object containing the UTF-8 encoded parsing canonical + form of an Avro schema. + Returns: + A bytes object with a length of eight in little-endian format. + """ + result = _EMPTY64_FINGERPRINT + + for b in data: + result = (result >> 8) ^ _FINGERPRINT_TABLE[(result ^ b) & 0xFF] + + # Although not mentioned in the Avro specification, the Java + # implementation gives fingerprint bytes in little-endian order + return result.to_bytes(length=8, byteorder="little", signed=False) + + # # Base Classes # @@ -240,6 +295,30 @@ def __eq__(self, that: object) -> bool: Consider the mixins EqualByPropsMixin and EqualByJsonMixin """ + def fingerprint(self, algorithm="CRC-64-AVRO") -> bytes: + """ + Generate fingerprint for supplied algorithm. + + 'CRC-64-AVRO' will be used as the algorithm by default, but any + algorithm supported by hashlib (as can be referenced with + `hashlib.algorithms_guaranteed`) can be specified. + + `algorithm` param is used as an algorithm name, and UnknownFingerprintAlgorithmException + will be raised if the algorithm is not among those supported. 
+ """ + schema = self.canonical_form.encode("utf-8") + + if algorithm == "CRC-64-AVRO": + return _crc_64_fingerprint(schema) + + if algorithm not in SUPPORTED_ALGORITHMS: + raise avro.errors.UnknownFingerprintAlgorithmException(f"Unknown Fingerprint Algorithm: {algorithm}") + + # Generate digests with hashlib for all other algorithms + # Lowercase algorithm to support algorithm strings sent by other languages like Java + h = hashlib.new(algorithm.lower(), schema) + return h.digest() + class NamedSchema(Schema): """Named Schemas specified in NAMED_TYPES.""" diff --git a/lang/py/avro/test/test_schema.py b/lang/py/avro/test/test_schema.py index c59ded8a73e..668ca8258f2 100644 --- a/lang/py/avro/test/test_schema.py +++ b/lang/py/avro/test/test_schema.py @@ -519,6 +519,204 @@ class InvalidTestSchema(TestSchema): ), ] + +# Fingerprint examples are in the form of tuples: +# - Value in Position 0 is schema +# - Value in Position 1 is an array of fingerprints: +# - Position 0 is CRC-64-AVRO fingerprint +# - Position 1 is MD5 fingerprint +# - Position 2 is SHA256 fingerprint +FINGERPRINT_EXAMPLES = [ + ('"int"', ["8f5c393f1ad57572", "ef524ea1b91e73173d938ade36c1db32", "3f2b87a9fe7cc9b13835598c3981cd45e3e355309e5090aa0933d7becb6fba45"]), + ('{"type": "int"}', ["8f5c393f1ad57572", "ef524ea1b91e73173d938ade36c1db32", "3f2b87a9fe7cc9b13835598c3981cd45e3e355309e5090aa0933d7becb6fba45"]), + ('"float"', ["90d7a83ecb027c4d", "50a6b9db85da367a6d2df400a41758a6", "1e71f9ec051d663f56b0d8e1fc84d71aa56ccfe9fa93aa20d10547a7abeb5cc0"]), + ( + '{"type": "float"}', + ["90d7a83ecb027c4d", "50a6b9db85da367a6d2df400a41758a6", "1e71f9ec051d663f56b0d8e1fc84d71aa56ccfe9fa93aa20d10547a7abeb5cc0"], + ), + ('"long"', ["b71df49344e154d0", "e1dd9a1ef98b451b53690370b393966b", "c32c497df6730c97fa07362aa5023f37d49a027ec452360778114cf427965add"]), + ( + '{"type": "long"}', + ["b71df49344e154d0", "e1dd9a1ef98b451b53690370b393966b", "c32c497df6730c97fa07362aa5023f37d49a027ec452360778114cf427965add"], + ), + 
('"double"', ["7e95ab32c035758e", "bfc71a62f38b99d6a93690deeb4b3af6", "730a9a8c611681d7eef442e03c16c70d13bca3eb8b977bb403eaff52176af254"]), + ( + '{"type": "double"}', + ["7e95ab32c035758e", "bfc71a62f38b99d6a93690deeb4b3af6", "730a9a8c611681d7eef442e03c16c70d13bca3eb8b977bb403eaff52176af254"], + ), + ('"bytes"', ["651920c3da16c04f", "b462f06cb909be57c85008867784cde6", "9ae507a9dd39ee5b7c7e285da2c0846521c8ae8d80feeae5504e0c981d53f5fa"]), + ( + '{"type": "bytes"}', + ["651920c3da16c04f", "b462f06cb909be57c85008867784cde6", "9ae507a9dd39ee5b7c7e285da2c0846521c8ae8d80feeae5504e0c981d53f5fa"], + ), + ('"string"', ["c70345637248018f", "095d71cf12556b9d5e330ad575b3df5d", "e9e5c1c9e4f6277339d1bcde0733a59bd42f8731f449da6dc13010a916930d48"]), + ( + '{"type": "string"}', + ["c70345637248018f", "095d71cf12556b9d5e330ad575b3df5d", "e9e5c1c9e4f6277339d1bcde0733a59bd42f8731f449da6dc13010a916930d48"], + ), + ('"boolean"', ["64f7d4a478fc429f", "01f692b30d4a1c8a3e600b1440637f8f", "a5b031ab62bc416d720c0410d802ea46b910c4fbe85c50a946ccc658b74e677e"]), + ( + '{"type": "boolean"}', + ["64f7d4a478fc429f", "01f692b30d4a1c8a3e600b1440637f8f", "a5b031ab62bc416d720c0410d802ea46b910c4fbe85c50a946ccc658b74e677e"], + ), + ('"null"', ["8a8f25cce724dd63", "9b41ef67651c18488a8b08bb67c75699", "f072cbec3bf8841871d4284230c5e983dc211a56837aed862487148f947d1a1f"]), + ( + '{"type": "null"}', + ["8a8f25cce724dd63", "9b41ef67651c18488a8b08bb67c75699", "f072cbec3bf8841871d4284230c5e983dc211a56837aed862487148f947d1a1f"], + ), + ( + '{"type": "fixed", "name": "Test", "size": 1}', + ["6869897b4049355b", "db01bc515fcfcd2d4be82ed385288261", "f527116a6f44455697e935afc31dc60ad0f95caf35e1d9c9db62edb3ffeb9170"], + ), + ( + json.dumps({"type": "fixed", "name": "MyFixed", "namespace": "org.apache.hadoop.avro", "size": 1}), + ["fadbd138e85bdf45", "d74b3726484422711c465d49e857b1ba", "28e493a44771cecc5deca4bd938cdc3d5a24cfe1f3760bc938fa1057df6334fc"], + ), + ( + '{"type": "enum", "name": "Test", "symbols": ["A", "B"]}', 
+ ["03a2f2c2e27f7a16", "d883f2a9b16ed085fcc5e4ca6c8f6ed1", "9b51286144f87ce5aebdc61ca834379effa5a41ce6ac0938630ff246297caca8"], + ), + ( + '{"type": "array", "items": "long"}', + ["715e2ea28bc91654", "c1c387e8d6a58f0df749b698991b1f43", "f78e954167feb23dcb1ce01e8463cebf3408e0a4259e16f24bd38f6d0f1d578b"], + ), + ( + json.dumps({"type": "array", "items": {"type": "enum", "name": "Test", "symbols": ["A", "B"]}}), + ["10d9ade1fa3a0387", "cfc7b861c7cfef082a6ef082948893fa", "0d8edd49d7f7e9553668f133577bc99f842852b55d9f84f1f7511e4961aa685c"], + ), + ( + '{"type": "map", "values": "long"}', + ["6f74f4e409b1334e", "32b3f1a3177a0e73017920f00448b56e", "b8fad07d458971a07692206b8a7cf626c86c62fe6bcff7c1b11bc7295de34853"], + ), + ( + json.dumps({"type": "map", "values": {"type": "enum", "name": "Test", "symbols": ["A", "B"]}}), + ["df2ab0626f6b812d", "c588da6ba99701c41e73fd30d23f994e", "3886747ed1669a8af476b549e97b34222afb2fed5f18bb27c6f367ea0351a576"], + ), + ( + '["string", "null", "long"]', + ["65a5be410d687566", "b11cf95f0a55dd55f9ee515a37bf937a", "ed8d254116441bb35e237ad0563cf5432b8c975334bd222c1ee84609435d95bb"], + ), + ( + json.dumps({"type": "record", "name": "Test", "fields": [{"name": "f", "type": "long"}]}), + ["ed94e5f5e6eb588e", "69531a03db788afe353244cd049b1e6d", "9670f15a8f96d23e92830d00b8bd57275e02e3e173ffef7c253c170b6beabeb8"], + ), + ( + json.dumps( + { + "type": "record", + "name": "Node", + "fields": [{"name": "label", "type": "string"}, {"name": "children", "type": {"type": "array", "items": "Node"}}], + } + ), + ["52cba544c3e756b7", "99625b0cc02050363e89ef66b0f406c9", "65d80dc8c95c98a9671d92cf0415edfabfee2cb058df2138606656cd6ae4dc59"], + ), + ( + json.dumps( + { + "type": "record", + "name": "Lisp", + "fields": [ + { + "name": "value", + "type": [ + "null", + "string", + {"type": "record", "name": "Cons", "fields": [{"name": "car", "type": "Lisp"}, {"name": "cdr", "type": "Lisp"}]}, + ], + } + ], + } + ), + ["68d91a23eda0b306", 
"9e1d0d15b52789fcb8e3a88b53059d5f", "e5ce4f4a15ce19fa1047cfe16a3b0e13a755db40f00f23284fdd376fc1c7dd21"], + ), + ( + json.dumps( + { + "type": "record", + "name": "HandshakeRequest", + "namespace": "org.apache.avro.ipc", + "fields": [ + {"name": "clientHash", "type": {"type": "fixed", "name": "MD5", "size": 16}}, + {"name": "clientProtocol", "type": ["null", "string"]}, + {"name": "serverHash", "type": "MD5"}, + {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]}, + ], + } + ), + ["43818703b7b5d769", "16ded8b5027e80a17704c6565c0c3f1b", "6c317314687da52a85c813a7f0c92298a60b79625b9acc072e4d9e4256a1d800"], + ), + ( + json.dumps( + { + "type": "record", + "name": "HandshakeResponse", + "namespace": "org.apache.avro.ipc", + "fields": [ + {"name": "match", "type": {"type": "enum", "name": "HandshakeMatch", "symbols": ["BOTH", "CLIENT", "NONE"]}}, + {"name": "serverProtocol", "type": ["null", "string"]}, + {"name": "serverHash", "type": ["null", {"name": "MD5", "size": 16, "type": "fixed"}]}, + {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]}, + ], + } + ), + ["00feee01de4ea50e", "afe529d01132daab7f4e2a6663e7a2f5", "a303cbbfe13958f880605d70c521a4b7be34d9265ac5a848f25916a67b11d889"], + ), + ( + json.dumps( + { + "type": "record", + "name": "Interop", + "namespace": "org.apache.avro", + "fields": [ + {"name": "intField", "type": "int"}, + {"name": "longField", "type": "long"}, + {"name": "stringField", "type": "string"}, + {"name": "boolField", "type": "boolean"}, + {"name": "floatField", "type": "float"}, + {"name": "doubleField", "type": "double"}, + {"name": "bytesField", "type": "bytes"}, + {"name": "nullField", "type": "null"}, + {"name": "arrayField", "type": {"type": "array", "items": "double"}}, + { + "name": "mapField", + "type": {"type": "map", "values": {"name": "Foo", "type": "record", "fields": [{"name": "label", "type": "string"}]}}, + }, + {"name": "unionField", "type": ["boolean", "double", {"type": "array", "items": 
"bytes"}]}, + {"name": "enumField", "type": {"type": "enum", "name": "Kind", "symbols": ["A", "B", "C"]}}, + {"name": "fixedField", "type": {"type": "fixed", "name": "MD5", "size": 16}}, + { + "name": "recordField", + "type": { + "type": "record", + "name": "Node", + "fields": [{"name": "label", "type": "string"}, {"name": "children", "type": {"type": "array", "items": "Node"}}], + }, + }, + ], + } + ), + ["e82c0a93a6a0b5a4", "994fea1a1be7ff8603cbe40c3bc7e4ca", "cccfd6e3f917cf53b0f90c206342e6703b0d905071f724a1c1f85b731c74058d"], + ), + ( + json.dumps( + { + "type": "record", + "name": "ipAddr", + "fields": [{"name": "addr", "type": [{"name": "IPv6", "type": "fixed", "size": 16}, {"name": "IPv4", "type": "fixed", "size": 4}]}], + } + ), + ["8d961b4e298a1844", "45d85c69b353a99b93d7c4f2fcf0c30d", "6f6fc8f685a4f07d99734946565d63108806d55a8620febea047cf52cb0ac181"], + ), + ( + json.dumps({"type": "record", "name": "TestDoc", "doc": "Doc string", "fields": [{"name": "name", "type": "string", "doc": "Doc String"}]}), + ["0e6660f02bcdc109", "f2da75f5131f5ab80629538287b8beb2", "0b3644f7aa5ca2fc4bad93ca2d3609c12aa9dbda9c15e68b34c120beff08e7b9"], + ), + ( + '{"type": "enum", "name": "Test", "symbols": ["A", "B"], "doc": "Doc String"}', + ["03a2f2c2e27f7a16", "d883f2a9b16ed085fcc5e4ca6c8f6ed1", "9b51286144f87ce5aebdc61ca834379effa5a41ce6ac0938630ff246297caca8"], + ), +] + EXAMPLES = PRIMITIVE_EXAMPLES EXAMPLES += FIXED_EXAMPLES EXAMPLES += ENUM_EXAMPLES @@ -634,13 +832,25 @@ def test_fixed_decimal_invalid_max_precision(self): def test_parse_invalid_symbol(self): """Disabling enumschema symbol validation should allow invalid symbols to pass.""" test_schema_string = json.dumps({"type": "enum", "name": "AVRO2174", "symbols": ["white space"]}) + with self.assertRaises(avro.errors.InvalidName, msg="When enum symbol validation is enabled, an invalid symbol should raise InvalidName."): avro.schema.parse(test_schema_string, validate_enum_symbols=True) + try: 
avro.schema.parse(test_schema_string, validate_enum_symbols=False) except avro.errors.InvalidName: # pragma: no coverage self.fail("When enum symbol validation is disabled, an invalid symbol should not raise InvalidName.") + def test_unsupported_fingerprint_algorithm(self): + s = avro.schema.parse('"int"') + self.assertRaises(avro.errors.UnknownFingerprintAlgorithmException, s.fingerprint, "foo") + + def test_less_popular_fingerprint_algorithm(self): + s = avro.schema.parse('"int"') + fingerprint = s.fingerprint("sha384") + hex_fingerprint = "".join(format(b, "02x") for b in fingerprint).zfill(16) + self.assertEqual(hex_fingerprint, "32ed5e4ac896570f044d1dab68f4c8ca9866ac06d22261f399316bf4799e16854750238085775107dfac905c82b2feaf") + class SchemaParseTestCase(unittest.TestCase): """Enable generating parse test cases over all the valid and invalid example schema.""" @@ -1181,6 +1391,39 @@ def test_large_record_interop(self): ) +class FingerprintTestCase(unittest.TestCase): + """ + Enable generating fingerprint test cases across algorithms. + + Fingerprint examples are in the form of tuples: + - Value in Position 0 is schema + - Value in Position 1 is an array of fingerprints: + - Position 0 is CRC-64-AVRO fingerprint + - Position 1 is MD5 fingerprint + - Position 2 is SHA256 fingerprint + """ + + def __init__(self, test_schema, fingerprints): + """Ignore the normal signature for unittest.TestCase because we are generating + many test cases from this one class. This is safe as long as the autoloader + ignores this class. The autoloader will ignore this class as long as it has + no methods starting with `test_`. 
+ """ + super(FingerprintTestCase, self).__init__("validate_fingerprint") + self.test_schema = test_schema + self.fingerprints = fingerprints + + def _hex_fingerprint(self, fingerprint): + return "".join(format(b, "02x") for b in fingerprint).zfill(16) + + def validate_fingerprint(self): + """The fingerprints of a Schema should match the expected value for each algorithm.""" + s = avro.schema.parse(self.test_schema) + self.assertEqual(self._hex_fingerprint(s.fingerprint()), self.fingerprints[0]) + self.assertEqual(self._hex_fingerprint(s.fingerprint("md5")), self.fingerprints[1]) + self.assertEqual(self._hex_fingerprint(s.fingerprint("sha256")), self.fingerprints[2]) + + def load_tests(loader, default_tests, pattern): """Generate test cases across many test schema.""" suite = unittest.TestSuite() @@ -1190,6 +1433,7 @@ def load_tests(loader, default_tests, pattern): suite.addTests(DocAttributesTestCase(ex) for ex in DOC_EXAMPLES) suite.addTests(OtherAttributesTestCase(ex) for ex in OTHER_PROP_EXAMPLES) suite.addTests(loader.loadTestsFromTestCase(CanonicalFormTestCase)) + suite.addTests(FingerprintTestCase(ex[0], ex[1]) for ex in FINGERPRINT_EXAMPLES) return suite