Skip to content

Commit

Permalink
Fixes psy0rz#225 zfs-check: efficient handling of sparse files
Browse files Browse the repository at this point in the history
BlockHasher.py
* hash_class is sourced from cli args instead of hardcoding it.
* hash_factory() lays the groundwork to support arbitrary hash libs.
* Detection of and use of xxhash lib.

ZfsCheck.py
* Implement new cli arg --hash. The choices for the arg are generated
based on what is detected in the python env.
* The input to --hash is validated against the arg choices.
* Implemented helper method determine_algorithms_available(). This tries
to pick a performant default with a fallback to sha1.
* Detection of and use of xxhash lib.
  • Loading branch information
kyle0r committed Nov 22, 2023
1 parent 7122dc9 commit bf2300b
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 5 deletions.
18 changes: 14 additions & 4 deletions zfs_autobackup/BlockHasher.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import hashlib
import os

# Optional dependency: xxhash provides much faster digests than hashlib.
# Bind the name to None when the module is absent so later code can
# feature-detect with a simple `xxhash is not None` check.
xxhash = None
try:
    import xxhash
except ImportError:
    # Narrowed from a bare `except:`, which would also swallow
    # SystemExit/KeyboardInterrupt and hide unrelated import-time bugs.
    pass

class BlockHasher():
"""This class was created to checksum huge files and blockdevices (TB's)
Expand All @@ -16,7 +21,7 @@ class BlockHasher():
"""

def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, skip=0):
def __init__(self, count=10000, bs=4096, hash_class=None, skip=0):
self.count = count
self.bs = bs
self.chunk_size=bs*count
Expand All @@ -28,6 +33,11 @@ def __init__(self, count=10000, bs=4096, hash_class=hashlib.sha1, skip=0):

self.stats_total_bytes=0

def hash_factory(self):
if self.hash_class in hashlib.algorithms_available:
return hashlib.new(self.hash_class)
if self.hash_class.startswith('xxh'):
return getattr(xxhash, self.hash_class)()

def _seek_next_chunk(self, fh, fsize):
"""seek fh to next chunk and update skip counter.
Expand Down Expand Up @@ -80,7 +90,7 @@ def generate(self, fname):
return

#read chunk
hash = self.hash_class()
hash = self.hash_factory()
block_nr = 0
while block_nr != self.count:
block=fh.read(self.bs)
Expand All @@ -105,7 +115,7 @@ def compare(self, fname, generator):
try:

checked = checked + 1
hash = self.hash_class()
hash = self.hash_factory()
f.seek(int(chunk_nr) * self.bs * self.count)
block_nr = 0
for block in iter(lambda: f.read(self.bs), b""):
Expand All @@ -124,4 +134,4 @@ def compare(self, fname, generator):
yield ( chunk_nr , hexdigest, 'ERROR: '+str(e))

except Exception as e:
yield ( '-', '-', 'ERROR: '+ str(e))
yield ( '-', '-', 'ERROR: '+ str(e))
22 changes: 21 additions & 1 deletion zfs_autobackup/ZfsCheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
from .util import *
from .CliBase import CliBase

from hashlib import algorithms_available
from copy import copy

# Optional dependency: xxhash provides much faster digests than hashlib.
# Bind the name to None when the module is absent so later code can
# feature-detect with a simple `xxhash is not None` check.
xxhash = None
try:
    import xxhash
except ImportError:
    # Narrowed from a bare `except:`, which would also swallow
    # SystemExit/KeyboardInterrupt and hide unrelated import-time bugs.
    pass

class ZfsCheck(CliBase):

Expand All @@ -20,7 +27,17 @@ def __init__(self, argv, print_arguments=True):

self.node = ZfsNode(self.log, utc=self.args.utc, readonly=self.args.test, debug_output=self.args.debug_output)

self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size, skip=self.args.skip)
self.block_hasher = BlockHasher(count=self.args.count, bs=self.args.block_size, skip=self.args.skip, hash_class=self.args.hash)

def determine_algorithms_available(self):
self.algorithms_available = copy(algorithms_available)

if None != xxhash:
for value in ( 'xxh128', 'xxh32', 'xxh3_128', 'xxh3_64', 'xxh64' ):
self.algorithms_available.add(value)
self.hash_default = 'xxh3_64'
else:
self.hash_default = 'sha1'

def get_parser(self):

Expand All @@ -42,6 +59,9 @@ def get_parser(self):
group.add_argument('--skip', '-s', metavar="NUMBER", default=0, type=int,
help="Skip this number of chunks after every hash. %(default)s")

self.determine_algorithms_available()
group.add_argument('--hash', default=self.hash_default,
help="Specify the hashing algorithm to use", choices=sorted([item for item in self.algorithms_available]))
return parser

def parse_args(self, argv):
Expand Down

0 comments on commit bf2300b

Please sign in to comment.