Skip to content

Commit

Permalink
AVRO-1938: Add fingerprinting support to Python implementation (#1181)
Browse files Browse the repository at this point in the history
* AVRO-1938 Add support for fingerprinting schemas

With this change, Schema fingerprints can be extracted by
invoking the `fingerprint` method on the schema object. By default,
fingerprints will be generated with the CRC-64-AVRO algorithm. Optionally,
the algorithm can be supplied.

All algorithms supported by hashlib are available, but Avro
recommends using one of CRC-64-AVRO, MD5, and SHA-256, as needed.

* AVRO-1938 Fix issue with AbstractSet typecheck

* Format with black

* Freeze Supported Algorithms Set

This commit addresses review comments and freezes the supported
fingerprinting algorithms set.

* Minor lint fix with black

* Address Typecheck issues with Frozenset

* Fold Fingerprint Mixin within Schema

Addresses PR 1181 review comments. Methods within Fingerprint mixin
have been made available at the module level, including static
variables used in fingerprinting. This PR has been synced with latest
master.

* Add type hints to fingerprint methods/variables

* Fix incorrect import sorting in schema.py to pass lint check

* Address @kojiromike Jul 16 review comments

* Address @kojiromike Jul 16 review comments - 2

* Address @kojiromike Jul 17 review comments

* Fix black lint issue
  • Loading branch information
subhashb authored Jul 17, 2023
1 parent dc81b35 commit f504265
Show file tree
Hide file tree
Showing 3 changed files with 328 additions and 1 deletion.
4 changes: 4 additions & 0 deletions lang/py/avro/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,7 @@ class UsageError(RuntimeError, AvroException):

class AvroRuntimeException(RuntimeError, AvroException):
"""Raised when compatibility parsing encounters an unknown type"""


class UnknownFingerprintAlgorithmException(AvroException):
    """Raised when a fingerprint is requested with an algorithm name that is not recognized."""
81 changes: 80 additions & 1 deletion lang/py/avro/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,23 @@
import collections
import datetime
import decimal
import hashlib
import json
import math
import uuid
import warnings
from functools import reduce
from pathlib import Path
from typing import List, Mapping, MutableMapping, Optional, Sequence, Union, cast
from typing import (
FrozenSet,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
Union,
cast,
)

import avro.constants
import avro.errors
Expand Down Expand Up @@ -104,6 +115,50 @@ def _is_timezone_aware_datetime(dt: datetime.datetime) -> bool:
return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None


# Fingerprint Constants
# Seed of the 64-bit fingerprint; it is also the value XOR-ed in while
# building the lookup table below.
_EMPTY64_FINGERPRINT: int = 0xC15D213AA4D7A795


def _fingerprint_table_entry(index: int) -> int:
    """Derive the lookup-table entry for one byte value (0-255).

    Eight rounds are applied to ``index``: each round shifts the running value
    right by one bit and XORs in ``_EMPTY64_FINGERPRINT`` when the bit shifted
    out was set.
    """

    def _one_round(value: int, _round: int) -> int:
        mask = _EMPTY64_FINGERPRINT if value & 1 else 0
        return (value >> 1) ^ mask

    return reduce(_one_round, range(8), index)


# 256-entry table, indexed by byte value, used by _crc_64_fingerprint.
_FINGERPRINT_TABLE: tuple = tuple(map(_fingerprint_table_entry, range(256)))


# Every algorithm that hashlib guarantees is accepted, in addition to
# Avro's own "CRC-64-AVRO":
# - 'blake2b',
# - 'blake2s',
# - 'md5',
# - 'sha1',
# - 'sha224',
# - 'sha256',
# - 'sha384',
# - 'sha3_224',
# - 'sha3_256',
# - 'sha3_384',
# - 'sha3_512',
# - 'sha512',
# - 'shake_128',
# - 'shake_256'
SUPPORTED_ALGORITHMS: FrozenSet[str] = frozenset(hashlib.algorithms_guaranteed).union({"CRC-64-AVRO"})


def _crc_64_fingerprint(data: bytes) -> bytes:
    """Compute the 64-bit Rabin fingerprint described in the Avro specification.

    Args:
        data: A bytes object containing the UTF-8 encoded parsing canonical
            form of an Avro schema.

    Returns:
        Eight bytes holding the fingerprint in little-endian order.
    """

    def _absorb(state: int, byte: int) -> int:
        # Table-driven update: consume one input byte per step.
        return _FINGERPRINT_TABLE[(state ^ byte) & 0xFF] ^ (state >> 8)

    fingerprint = reduce(_absorb, data, _EMPTY64_FINGERPRINT)

    # The Avro specification does not state a byte order; little-endian is
    # used to match what the Java implementation produces.
    return fingerprint.to_bytes(8, "little", signed=False)


#
# Base Classes
#
Expand Down Expand Up @@ -240,6 +295,30 @@ def __eq__(self, that: object) -> bool:
Consider the mixins EqualByPropsMixin and EqualByJsonMixin
"""

def fingerprint(self, algorithm: str = "CRC-64-AVRO") -> bytes:
    """Generate the fingerprint of this schema for the supplied algorithm.

    The schema's canonical form is encoded as UTF-8 and then hashed.

    Args:
        algorithm: Name of the fingerprinting algorithm. 'CRC-64-AVRO'
            (the default) selects the 64-bit Rabin fingerprint. Any other
            name must be a member of `SUPPORTED_ALGORITHMS` (the names in
            `hashlib.algorithms_guaranteed`); these hashlib names are
            matched case-insensitively so that spellings such as "MD5",
            as sent by other language implementations, are accepted.

    Returns:
        The raw fingerprint bytes.

    Raises:
        avro.errors.UnknownFingerprintAlgorithmException: If `algorithm` is
            not among the supported algorithms.
    """
    schema = self.canonical_form.encode("utf-8")

    if algorithm == "CRC-64-AVRO":
        return _crc_64_fingerprint(schema)

    # Lowercase *before* the membership test. SUPPORTED_ALGORITHMS holds the
    # lowercase hashlib names, so checking the raw string would reject "MD5"
    # and the lowercasing below would never get a chance to help.
    # Non-string values are passed through untouched so they are still
    # reported as an unknown algorithm rather than failing on `.lower()`.
    hashlib_name = algorithm.lower() if isinstance(algorithm, str) else algorithm

    if hashlib_name not in SUPPORTED_ALGORITHMS:
        raise avro.errors.UnknownFingerprintAlgorithmException(f"Unknown Fingerprint Algorithm: {algorithm}")

    # Generate digests with hashlib for all other algorithms.
    # NOTE(review): 'shake_128' / 'shake_256' are in hashlib.algorithms_guaranteed,
    # but their digest() requires an explicit length, so this call looks like it
    # raises TypeError for them -- confirm and decide on a length or exclude them.
    return hashlib.new(hashlib_name, schema).digest()


class NamedSchema(Schema):
"""Named Schemas specified in NAMED_TYPES."""
Expand Down
244 changes: 244 additions & 0 deletions lang/py/avro/test/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,204 @@ class InvalidTestSchema(TestSchema):
),
]


# Fingerprint examples are in the form of tuples:
# - Value in Position 0 is schema
# - Value in Position 1 is an array of fingerprints:
# - Position 0 is CRC-64-AVRO fingerprint
# - Position 1 is MD5 fingerprint
# - Position 2 is SHA256 fingerprint
FINGERPRINT_EXAMPLES = [
('"int"', ["8f5c393f1ad57572", "ef524ea1b91e73173d938ade36c1db32", "3f2b87a9fe7cc9b13835598c3981cd45e3e355309e5090aa0933d7becb6fba45"]),
('{"type": "int"}', ["8f5c393f1ad57572", "ef524ea1b91e73173d938ade36c1db32", "3f2b87a9fe7cc9b13835598c3981cd45e3e355309e5090aa0933d7becb6fba45"]),
('"float"', ["90d7a83ecb027c4d", "50a6b9db85da367a6d2df400a41758a6", "1e71f9ec051d663f56b0d8e1fc84d71aa56ccfe9fa93aa20d10547a7abeb5cc0"]),
(
'{"type": "float"}',
["90d7a83ecb027c4d", "50a6b9db85da367a6d2df400a41758a6", "1e71f9ec051d663f56b0d8e1fc84d71aa56ccfe9fa93aa20d10547a7abeb5cc0"],
),
('"long"', ["b71df49344e154d0", "e1dd9a1ef98b451b53690370b393966b", "c32c497df6730c97fa07362aa5023f37d49a027ec452360778114cf427965add"]),
(
'{"type": "long"}',
["b71df49344e154d0", "e1dd9a1ef98b451b53690370b393966b", "c32c497df6730c97fa07362aa5023f37d49a027ec452360778114cf427965add"],
),
('"double"', ["7e95ab32c035758e", "bfc71a62f38b99d6a93690deeb4b3af6", "730a9a8c611681d7eef442e03c16c70d13bca3eb8b977bb403eaff52176af254"]),
(
'{"type": "double"}',
["7e95ab32c035758e", "bfc71a62f38b99d6a93690deeb4b3af6", "730a9a8c611681d7eef442e03c16c70d13bca3eb8b977bb403eaff52176af254"],
),
('"bytes"', ["651920c3da16c04f", "b462f06cb909be57c85008867784cde6", "9ae507a9dd39ee5b7c7e285da2c0846521c8ae8d80feeae5504e0c981d53f5fa"]),
(
'{"type": "bytes"}',
["651920c3da16c04f", "b462f06cb909be57c85008867784cde6", "9ae507a9dd39ee5b7c7e285da2c0846521c8ae8d80feeae5504e0c981d53f5fa"],
),
('"string"', ["c70345637248018f", "095d71cf12556b9d5e330ad575b3df5d", "e9e5c1c9e4f6277339d1bcde0733a59bd42f8731f449da6dc13010a916930d48"]),
(
'{"type": "string"}',
["c70345637248018f", "095d71cf12556b9d5e330ad575b3df5d", "e9e5c1c9e4f6277339d1bcde0733a59bd42f8731f449da6dc13010a916930d48"],
),
('"boolean"', ["64f7d4a478fc429f", "01f692b30d4a1c8a3e600b1440637f8f", "a5b031ab62bc416d720c0410d802ea46b910c4fbe85c50a946ccc658b74e677e"]),
(
'{"type": "boolean"}',
["64f7d4a478fc429f", "01f692b30d4a1c8a3e600b1440637f8f", "a5b031ab62bc416d720c0410d802ea46b910c4fbe85c50a946ccc658b74e677e"],
),
('"null"', ["8a8f25cce724dd63", "9b41ef67651c18488a8b08bb67c75699", "f072cbec3bf8841871d4284230c5e983dc211a56837aed862487148f947d1a1f"]),
(
'{"type": "null"}',
["8a8f25cce724dd63", "9b41ef67651c18488a8b08bb67c75699", "f072cbec3bf8841871d4284230c5e983dc211a56837aed862487148f947d1a1f"],
),
(
'{"type": "fixed", "name": "Test", "size": 1}',
["6869897b4049355b", "db01bc515fcfcd2d4be82ed385288261", "f527116a6f44455697e935afc31dc60ad0f95caf35e1d9c9db62edb3ffeb9170"],
),
(
json.dumps({"type": "fixed", "name": "MyFixed", "namespace": "org.apache.hadoop.avro", "size": 1}),
["fadbd138e85bdf45", "d74b3726484422711c465d49e857b1ba", "28e493a44771cecc5deca4bd938cdc3d5a24cfe1f3760bc938fa1057df6334fc"],
),
(
'{"type": "enum", "name": "Test", "symbols": ["A", "B"]}',
["03a2f2c2e27f7a16", "d883f2a9b16ed085fcc5e4ca6c8f6ed1", "9b51286144f87ce5aebdc61ca834379effa5a41ce6ac0938630ff246297caca8"],
),
(
'{"type": "array", "items": "long"}',
["715e2ea28bc91654", "c1c387e8d6a58f0df749b698991b1f43", "f78e954167feb23dcb1ce01e8463cebf3408e0a4259e16f24bd38f6d0f1d578b"],
),
(
json.dumps({"type": "array", "items": {"type": "enum", "name": "Test", "symbols": ["A", "B"]}}),
["10d9ade1fa3a0387", "cfc7b861c7cfef082a6ef082948893fa", "0d8edd49d7f7e9553668f133577bc99f842852b55d9f84f1f7511e4961aa685c"],
),
(
'{"type": "map", "values": "long"}',
["6f74f4e409b1334e", "32b3f1a3177a0e73017920f00448b56e", "b8fad07d458971a07692206b8a7cf626c86c62fe6bcff7c1b11bc7295de34853"],
),
(
json.dumps({"type": "map", "values": {"type": "enum", "name": "Test", "symbols": ["A", "B"]}}),
["df2ab0626f6b812d", "c588da6ba99701c41e73fd30d23f994e", "3886747ed1669a8af476b549e97b34222afb2fed5f18bb27c6f367ea0351a576"],
),
(
'["string", "null", "long"]',
["65a5be410d687566", "b11cf95f0a55dd55f9ee515a37bf937a", "ed8d254116441bb35e237ad0563cf5432b8c975334bd222c1ee84609435d95bb"],
),
(
json.dumps({"type": "record", "name": "Test", "fields": [{"name": "f", "type": "long"}]}),
["ed94e5f5e6eb588e", "69531a03db788afe353244cd049b1e6d", "9670f15a8f96d23e92830d00b8bd57275e02e3e173ffef7c253c170b6beabeb8"],
),
(
json.dumps(
{
"type": "record",
"name": "Node",
"fields": [{"name": "label", "type": "string"}, {"name": "children", "type": {"type": "array", "items": "Node"}}],
}
),
["52cba544c3e756b7", "99625b0cc02050363e89ef66b0f406c9", "65d80dc8c95c98a9671d92cf0415edfabfee2cb058df2138606656cd6ae4dc59"],
),
(
json.dumps(
{
"type": "record",
"name": "Lisp",
"fields": [
{
"name": "value",
"type": [
"null",
"string",
{"type": "record", "name": "Cons", "fields": [{"name": "car", "type": "Lisp"}, {"name": "cdr", "type": "Lisp"}]},
],
}
],
}
),
["68d91a23eda0b306", "9e1d0d15b52789fcb8e3a88b53059d5f", "e5ce4f4a15ce19fa1047cfe16a3b0e13a755db40f00f23284fdd376fc1c7dd21"],
),
(
json.dumps(
{
"type": "record",
"name": "HandshakeRequest",
"namespace": "org.apache.avro.ipc",
"fields": [
{"name": "clientHash", "type": {"type": "fixed", "name": "MD5", "size": 16}},
{"name": "clientProtocol", "type": ["null", "string"]},
{"name": "serverHash", "type": "MD5"},
{"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]},
],
}
),
["43818703b7b5d769", "16ded8b5027e80a17704c6565c0c3f1b", "6c317314687da52a85c813a7f0c92298a60b79625b9acc072e4d9e4256a1d800"],
),
(
json.dumps(
{
"type": "record",
"name": "HandshakeResponse",
"namespace": "org.apache.avro.ipc",
"fields": [
{"name": "match", "type": {"type": "enum", "name": "HandshakeMatch", "symbols": ["BOTH", "CLIENT", "NONE"]}},
{"name": "serverProtocol", "type": ["null", "string"]},
{"name": "serverHash", "type": ["null", {"name": "MD5", "size": 16, "type": "fixed"}]},
{"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]},
],
}
),
["00feee01de4ea50e", "afe529d01132daab7f4e2a6663e7a2f5", "a303cbbfe13958f880605d70c521a4b7be34d9265ac5a848f25916a67b11d889"],
),
(
json.dumps(
{
"type": "record",
"name": "Interop",
"namespace": "org.apache.avro",
"fields": [
{"name": "intField", "type": "int"},
{"name": "longField", "type": "long"},
{"name": "stringField", "type": "string"},
{"name": "boolField", "type": "boolean"},
{"name": "floatField", "type": "float"},
{"name": "doubleField", "type": "double"},
{"name": "bytesField", "type": "bytes"},
{"name": "nullField", "type": "null"},
{"name": "arrayField", "type": {"type": "array", "items": "double"}},
{
"name": "mapField",
"type": {"type": "map", "values": {"name": "Foo", "type": "record", "fields": [{"name": "label", "type": "string"}]}},
},
{"name": "unionField", "type": ["boolean", "double", {"type": "array", "items": "bytes"}]},
{"name": "enumField", "type": {"type": "enum", "name": "Kind", "symbols": ["A", "B", "C"]}},
{"name": "fixedField", "type": {"type": "fixed", "name": "MD5", "size": 16}},
{
"name": "recordField",
"type": {
"type": "record",
"name": "Node",
"fields": [{"name": "label", "type": "string"}, {"name": "children", "type": {"type": "array", "items": "Node"}}],
},
},
],
}
),
["e82c0a93a6a0b5a4", "994fea1a1be7ff8603cbe40c3bc7e4ca", "cccfd6e3f917cf53b0f90c206342e6703b0d905071f724a1c1f85b731c74058d"],
),
(
json.dumps(
{
"type": "record",
"name": "ipAddr",
"fields": [{"name": "addr", "type": [{"name": "IPv6", "type": "fixed", "size": 16}, {"name": "IPv4", "type": "fixed", "size": 4}]}],
}
),
["8d961b4e298a1844", "45d85c69b353a99b93d7c4f2fcf0c30d", "6f6fc8f685a4f07d99734946565d63108806d55a8620febea047cf52cb0ac181"],
),
(
json.dumps({"type": "record", "name": "TestDoc", "doc": "Doc string", "fields": [{"name": "name", "type": "string", "doc": "Doc String"}]}),
["0e6660f02bcdc109", "f2da75f5131f5ab80629538287b8beb2", "0b3644f7aa5ca2fc4bad93ca2d3609c12aa9dbda9c15e68b34c120beff08e7b9"],
),
(
'{"type": "enum", "name": "Test", "symbols": ["A", "B"], "doc": "Doc String"}',
["03a2f2c2e27f7a16", "d883f2a9b16ed085fcc5e4ca6c8f6ed1", "9b51286144f87ce5aebdc61ca834379effa5a41ce6ac0938630ff246297caca8"],
),
]

EXAMPLES = PRIMITIVE_EXAMPLES
EXAMPLES += FIXED_EXAMPLES
EXAMPLES += ENUM_EXAMPLES
Expand Down Expand Up @@ -634,13 +832,25 @@ def test_fixed_decimal_invalid_max_precision(self):
def test_parse_invalid_symbol(self):
    """An invalid enum symbol is rejected only while symbol validation is enabled."""
    enum_with_bad_symbol = json.dumps({"type": "enum", "name": "AVRO2174", "symbols": ["white space"]})
    strict_failure_msg = "When enum symbol validation is enabled, an invalid symbol should raise InvalidName."

    # Validation on: parsing must raise InvalidName.
    with self.assertRaises(avro.errors.InvalidName, msg=strict_failure_msg):
        avro.schema.parse(enum_with_bad_symbol, validate_enum_symbols=True)

    # Validation off: the same schema must parse without raising InvalidName.
    try:
        avro.schema.parse(enum_with_bad_symbol, validate_enum_symbols=False)
    except avro.errors.InvalidName:  # pragma: no coverage
        self.fail("When enum symbol validation is disabled, an invalid symbol should not raise InvalidName.")

def test_unsupported_fingerprint_algorithm(self):
    """Requesting a fingerprint with an unrecognized algorithm name must raise."""
    int_schema = avro.schema.parse('"int"')
    with self.assertRaises(avro.errors.UnknownFingerprintAlgorithmException):
        int_schema.fingerprint("foo")

def test_less_popular_fingerprint_algorithm(self):
    """A hashlib algorithm outside the usual three (here sha384) also produces the expected digest."""
    expected_hex = "32ed5e4ac896570f044d1dab68f4c8ca9866ac06d22261f399316bf4799e16854750238085775107dfac905c82b2feaf"
    digest = avro.schema.parse('"int"').fingerprint("sha384")
    self.assertEqual(digest.hex().zfill(16), expected_hex)


class SchemaParseTestCase(unittest.TestCase):
"""Enable generating parse test cases over all the valid and invalid example schema."""
Expand Down Expand Up @@ -1181,6 +1391,39 @@ def test_large_record_interop(self):
)


class FingerprintTestCase(unittest.TestCase):
    """
    Enable generating fingerprint test cases across algorithms.

    Fingerprint examples are in the form of tuples:
    - Value in Position 0 is schema
    - Value in Position 1 is an array of fingerprints:
        - Position 0 is CRC-64-AVRO fingerprint
        - Position 1 is MD5 fingerprint
        - Position 2 is SHA256 fingerprint
    """

    def __init__(self, test_schema, fingerprints):
        """Ignore the normal signature for unittest.TestCase because we are generating
        many test cases from this one class. This is safe as long as the autoloader
        ignores this class. The autoloader will ignore this class as long as it has
        no methods starting with `test_`.

        Args:
            test_schema: The schema, as a JSON string, to fingerprint.
            fingerprints: Expected hex fingerprints, ordered CRC-64-AVRO, MD5, SHA256.
        """
        super().__init__("validate_fingerprint")
        self.test_schema = test_schema
        self.fingerprints = fingerprints

    def _hex_fingerprint(self, fingerprint):
        """Render fingerprint bytes as lowercase hex, left-padded with zeros to at least 16 characters."""
        return fingerprint.hex().zfill(16)

    def validate_fingerprint(self):
        """The schema's fingerprints should match the expected value for each algorithm."""
        s = avro.schema.parse(self.test_schema)
        # No argument: exercises the default algorithm (CRC-64-AVRO).
        self.assertEqual(self._hex_fingerprint(s.fingerprint()), self.fingerprints[0])
        self.assertEqual(self._hex_fingerprint(s.fingerprint("md5")), self.fingerprints[1])
        self.assertEqual(self._hex_fingerprint(s.fingerprint("sha256")), self.fingerprints[2])


def load_tests(loader, default_tests, pattern):
"""Generate test cases across many test schema."""
suite = unittest.TestSuite()
Expand All @@ -1190,6 +1433,7 @@ def load_tests(loader, default_tests, pattern):
suite.addTests(DocAttributesTestCase(ex) for ex in DOC_EXAMPLES)
suite.addTests(OtherAttributesTestCase(ex) for ex in OTHER_PROP_EXAMPLES)
suite.addTests(loader.loadTestsFromTestCase(CanonicalFormTestCase))
suite.addTests(FingerprintTestCase(ex[0], ex[1]) for ex in FINGERPRINT_EXAMPLES)
return suite


Expand Down

0 comments on commit f504265

Please sign in to comment.