Skip to content

Commit

Permalink
Fix various bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Feb 7, 2024
1 parent f45089e commit e89e3fd
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 46 deletions.
2 changes: 1 addition & 1 deletion sgkit/accelerate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from numba import guvectorize, jit

_DISABLE_CACHE = os.environ.get("SGKIT_DISABLE_NUMBA_CACHE", "0")
_DISABLE_CACHE = os.environ.get("SGKIT_DISABLE_NUMBA_CACHE", "1")

try:
CACHE_NUMBA = {"0": True, "1": False}[_DISABLE_CACHE]
Expand Down
58 changes: 28 additions & 30 deletions sgkit/io/vcf/vcf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,11 +349,11 @@ def sanitise_value_float_2d(buff, j, value):


def sanitise_int_array(value, ndmin, dtype):
value = np.array(value, ndmin=ndmin, dtype=dtype, copy=False)
# FIXME
value[value == (MIN_INT_VALUE - 2)] = -1
value[value == (MIN_INT_VALUE - 1)] = -2
return value
value = np.array(value, ndmin=ndmin, copy=False)
value[value == VCF_INT_MISSING] = -1
value[value == VCF_INT_FILL] = -2
# TODO watch out for clipping here!
return value.astype(dtype)


def sanitise_value_int_1d(buff, j, value):
Expand Down Expand Up @@ -397,9 +397,6 @@ def num_chunks(self, partition_index):
self.partition_num_chunks[partition_index] = n
return self.partition_num_chunks[partition_index]

def is_numeric(self):
return self.vcf_field.vcf_type in ("Integer", "Float")

def __repr__(self):
# TODO add class name
return repr({"path": str(self.path), **self.vcf_field.summary.asdict()})
Expand Down Expand Up @@ -474,9 +471,6 @@ def sanitiser_factory(self, shape):
else:
return sanitise_value_string_2d

print(shape)

# return ret


def update_bounds_float(summary, value, number_dim):
Expand All @@ -493,15 +487,17 @@ def update_bounds_float(summary, value, number_dim):


MIN_INT_VALUE = np.iinfo(np.int32).min + 2
VCF_INT_MISSING = np.iinfo(np.int32).min
VCF_INT_FILL = np.iinfo(np.int32).min + 1


def update_bounds_integer(summary, value, number_dim):
# print("update bounds int", summary, value)
value = np.array(value, dtype=np.int32, copy=False)
# Mask out missing and fill values
value[value < MIN_INT_VALUE] = 0
summary.max_value = int(max(summary.max_value, np.max(value)))
summary.min_value = int(min(summary.min_value, np.min(value)))
a = value[value >= MIN_INT_VALUE]
summary.max_value = int(max(summary.max_value, np.max(a)))
summary.min_value = int(min(summary.min_value, np.min(a)))
number = 0
assert len(value.shape) <= number_dim + 1
if len(value.shape) == number_dim + 1:
Expand Down Expand Up @@ -857,6 +853,7 @@ class ZarrConversionSpec:
dimensions: list
sample_id: list
contig_id: list
contig_length: list
filter_id: list
variables: list

Expand Down Expand Up @@ -895,8 +892,8 @@ def fixed_field_spec(
compressor=compressor,
)

# FIXME
max_alleles = 4
alt_col = pcvcf.columns["ALT"]
max_alleles = alt_col.vcf_field.summary.max_number + 1
num_filters = len(pcvcf.metadata.filters)

# # FIXME get dtype from lookup table
Expand Down Expand Up @@ -1023,6 +1020,7 @@ def fixed_field_spec(
dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
sample_id=pcvcf.metadata.samples,
contig_id=pcvcf.metadata.contig_names,
contig_length=pcvcf.metadata.contig_lengths,
filter_id=pcvcf.metadata.filters,
)

Expand Down Expand Up @@ -1064,7 +1062,6 @@ def encode_column(self, pcvcf, column):
ba = BufferedArray(array)
sanitiser = source_col.sanitiser_factory(ba.buff.shape)
chunk_length = array.chunks[0]
# num_variants = array.shape[0]

with cf.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
Expand Down Expand Up @@ -1111,8 +1108,9 @@ def encode_genotypes(self, pcvcf):
for value, bytes_read in source_col.iter_values_bytes():
sanitise_value_int_2d(gt.buff, j, value[:, :-1])
sanitise_value_int_1d(gt_phased.buff, j, value[:, -1])
# FIXME set gt-mask
# gt_mask.buff[j: len(value) - 1] =
# TODO check is this the correct semantics when we are padding
# with mixed ploidies?
gt_mask.buff[j] = gt.buff[j] < 0

j += 1
if j == chunk_length:
Expand Down Expand Up @@ -1167,23 +1165,22 @@ def encode_samples(self, pcvcf, sample_id, chunk_width):
)
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]

def encode_contig(self, pcvcf, contig_names):
def encode_contig(self, pcvcf, contig_names, contig_lengths):
array = self.root.array(
"contig_id",
contig_names,
dtype="str",
compressor=default_compressor,
)
array.attrs["_ARRAY_DIMENSIONS"] = ["contig"]
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]

# TODO add contig_lengths
# if pcvcf.metadata.contig_lengths is not None:
# full_array(
# "contig_length",
# pcvcf.metadata.contig_lengths,
# ["configs"],
# dtype=np.int64,
# )
if contig_lengths is not None:
array = self.root.array(
"contig_length",
contig_lengths,
dtype=np.int64,
)
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]

col = pcvcf.columns["CHROM"]
array = self.root["variant_contig"]
Expand Down Expand Up @@ -1288,7 +1285,8 @@ def convert(
),
executor.submit(sgvcf.encode_alleles, pcvcf),
executor.submit(sgvcf.encode_id, pcvcf),
executor.submit(sgvcf.encode_contig, pcvcf, conversion_spec.contig_id),
executor.submit(sgvcf.encode_contig, pcvcf, conversion_spec.contig_id,
conversion_spec.contig_length),
executor.submit(sgvcf.encode_filters, pcvcf, conversion_spec.filter_id),
]
has_gt = False
Expand Down
23 changes: 8 additions & 15 deletions sgkit/tests/io/vcf/test_vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,6 @@ def test_vcf_to_zarr__small_vcf(
path, chunk_length=5, chunk_width=2, fields=fields, field_defs=field_defs
)

print(ds)
print(ds.variant_NS)

assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"])
assert_array_equal(
ds["variant_filter"],
Expand Down Expand Up @@ -132,10 +129,6 @@ def test_vcf_to_zarr__small_vcf(
)
assert ds["variant_NS"].chunks[0][0] == 5

# assert_array_equal(
# ds["variant_AN"],
# [-2, -2, -2, -2, -2, -2, 6, -2, -2],
# )
assert_array_equal(
ds["variant_AN"],
[-1, -1, -1, -1, -1, -1, 6, -1, -1],
Expand Down Expand Up @@ -277,19 +270,19 @@ def test_vcf_to_zarr__small_vcf(
[[58, 50], [65, 3], [-1, -1]],
[[23, 27], [18, 2], [-1, -1]],
[[56, 60], [51, 51], [-1, -1]],
[[-2, -2], [-2, -2], [-2, -2]],
[[-2, -2], [-2, -2], [-2, -2]],
[[-2, -2], [-2, -2], [-2, -2]],
[[-1, -1], [-1, -1], [-1, -1]],
[[-1, -1], [-1, -1], [-1, -1]],
[[-1, -1], [-1, -1], [-1, -1]],
]

# print(np.array2string(ds["call_genotype_mask"].values, separator=","))
# print(np.array2string(ds["call_HQ"].values, separator=","))
# print(np.array2string(ds["call_genotype"].values < 0, separator=","))

assert_array_equal(ds["call_genotype"], call_genotype)
print("FIXME")
# assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
assert_array_equal(ds["call_genotype_phased"], call_genotype_phased)
# assert_array_equal(ds["call_DP"], call_DP)
# assert_array_equal(ds["call_HQ"], call_HQ)
assert_array_equal(ds["call_DP"], call_DP)
assert_array_equal(ds["call_HQ"], call_HQ)

for name in ["call_genotype", "call_genotype_mask", "call_HQ"]:
assert ds[name].chunks == ((5, 4), (2, 1), (2,))
Expand Down

0 comments on commit e89e3fd

Please sign in to comment.