Fix various bugs

jeromekelleher · Feb 7, 2024 · e89e3fd · e89e3fd
1 parent f45089e
commit e89e3fd
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 46 deletions.
diff --git a/sgkit/accelerate.py b/sgkit/accelerate.py
@@ -3,7 +3,7 @@
 
 from numba import guvectorize, jit
 
-_DISABLE_CACHE = os.environ.get("SGKIT_DISABLE_NUMBA_CACHE", "0")
+_DISABLE_CACHE = os.environ.get("SGKIT_DISABLE_NUMBA_CACHE", "1")
 
 try:
     CACHE_NUMBA = {"0": True, "1": False}[_DISABLE_CACHE]

diff --git a/sgkit/io/vcf/vcf_converter.py b/sgkit/io/vcf/vcf_converter.py
@@ -349,11 +349,11 @@ def sanitise_value_float_2d(buff, j, value):
 
 
 def sanitise_int_array(value, ndmin, dtype):
-    value = np.array(value, ndmin=ndmin, dtype=dtype, copy=False)
-    # FIXME
-    value[value == (MIN_INT_VALUE - 2)] = -1
-    value[value == (MIN_INT_VALUE - 1)] = -2
-    return value
+    value = np.array(value, ndmin=ndmin, copy=False)
+    value[value == VCF_INT_MISSING] = -1
+    value[value == VCF_INT_FILL] = -2
+    # TODO watch out for clipping here!
+    return value.astype(dtype)
 
 
 def sanitise_value_int_1d(buff, j, value):
@@ -397,9 +397,6 @@ def num_chunks(self, partition_index):
             self.partition_num_chunks[partition_index] = n
         return self.partition_num_chunks[partition_index]
 
-    def is_numeric(self):
-        return self.vcf_field.vcf_type in ("Integer", "Float")
-
     def __repr__(self):
         # TODO add class name
         return repr({"path": str(self.path), **self.vcf_field.summary.asdict()})
@@ -474,9 +471,6 @@ def sanitiser_factory(self, shape):
             else:
                 return sanitise_value_string_2d
 
-            print(shape)
-
-        # return ret
 
 
 def update_bounds_float(summary, value, number_dim):
@@ -493,15 +487,17 @@ def update_bounds_float(summary, value, number_dim):
 
 
 MIN_INT_VALUE = np.iinfo(np.int32).min + 2
+VCF_INT_MISSING = np.iinfo(np.int32).min
+VCF_INT_FILL = np.iinfo(np.int32).min + 1
 
 
 def update_bounds_integer(summary, value, number_dim):
     # print("update bounds int", summary, value)
     value = np.array(value, dtype=np.int32, copy=False)
     # Mask out missing and fill values
-    value[value < MIN_INT_VALUE] = 0
-    summary.max_value = int(max(summary.max_value, np.max(value)))
-    summary.min_value = int(min(summary.min_value, np.min(value)))
+    a = value[value >= MIN_INT_VALUE]
+    summary.max_value = int(max(summary.max_value, np.max(a)))
+    summary.min_value = int(min(summary.min_value, np.min(a)))
     number = 0
     assert len(value.shape) <= number_dim + 1
     if len(value.shape) == number_dim + 1:
@@ -857,6 +853,7 @@ class ZarrConversionSpec:
     dimensions: list
     sample_id: list
     contig_id: list
+    contig_length: list
     filter_id: list
     variables: list
 
@@ -895,8 +892,8 @@ def fixed_field_spec(
                 compressor=compressor,
             )
 
-        # FIXME
-        max_alleles = 4
+        alt_col = pcvcf.columns["ALT"]
+        max_alleles = alt_col.vcf_field.summary.max_number + 1
         num_filters = len(pcvcf.metadata.filters)
 
         # # FIXME get dtype from lookup table
@@ -1023,6 +1020,7 @@ def fixed_field_spec(
             dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
             sample_id=pcvcf.metadata.samples,
             contig_id=pcvcf.metadata.contig_names,
+            contig_length=pcvcf.metadata.contig_lengths,
             filter_id=pcvcf.metadata.filters,
         )
 
@@ -1064,7 +1062,6 @@ def encode_column(self, pcvcf, column):
         ba = BufferedArray(array)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
         chunk_length = array.chunks[0]
-        # num_variants = array.shape[0]
 
         with cf.ThreadPoolExecutor(max_workers=4) as executor:
             futures = []
@@ -1111,8 +1108,9 @@ def encode_genotypes(self, pcvcf):
             for value, bytes_read in source_col.iter_values_bytes():
                 sanitise_value_int_2d(gt.buff, j, value[:, :-1])
                 sanitise_value_int_1d(gt_phased.buff, j, value[:, -1])
-                # FIXME set gt-mask
-                # gt_mask.buff[j: len(value) - 1] =
+                # TODO check is this the correct semantics when we are padding
+                # with mixed ploidies?
+                gt_mask.buff[j] = gt.buff[j] < 0
 
                 j += 1
                 if j == chunk_length:
@@ -1167,23 +1165,22 @@ def encode_samples(self, pcvcf, sample_id, chunk_width):
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
 
-    def encode_contig(self, pcvcf, contig_names):
+    def encode_contig(self, pcvcf, contig_names, contig_lengths):
         array = self.root.array(
             "contig_id",
             contig_names,
             dtype="str",
             compressor=default_compressor,
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["contig"]
+        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
 
-        # TODO add contig_lengths
-        # if pcvcf.metadata.contig_lengths is not None:
-        #     full_array(
-        #         "contig_length",
-        #         pcvcf.metadata.contig_lengths,
-        #         ["configs"],
-        #         dtype=np.int64,
-        #     )
+        if contig_lengths is not None:
+            array = self.root.array(
+                "contig_length",
+                contig_lengths,
+                dtype=np.int64,
+            )
+            array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
 
         col = pcvcf.columns["CHROM"]
         array = self.root["variant_contig"]
@@ -1288,7 +1285,8 @@ def convert(
                 ),
                 executor.submit(sgvcf.encode_alleles, pcvcf),
                 executor.submit(sgvcf.encode_id, pcvcf),
-                executor.submit(sgvcf.encode_contig, pcvcf, conversion_spec.contig_id),
+                executor.submit(sgvcf.encode_contig, pcvcf, conversion_spec.contig_id,
+                    conversion_spec.contig_length),
                 executor.submit(sgvcf.encode_filters, pcvcf, conversion_spec.filter_id),
             ]
             has_gt = False

diff --git a/sgkit/tests/io/vcf/test_vcf_reader.py b/sgkit/tests/io/vcf/test_vcf_reader.py
@@ -92,9 +92,6 @@ def test_vcf_to_zarr__small_vcf(
             path, chunk_length=5, chunk_width=2, fields=fields, field_defs=field_defs
         )
 
-    print(ds)
-    print(ds.variant_NS)
-
     assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"])
     assert_array_equal(
         ds["variant_filter"],
@@ -132,10 +129,6 @@ def test_vcf_to_zarr__small_vcf(
     )
     assert ds["variant_NS"].chunks[0][0] == 5
 
-    # assert_array_equal(
-    #     ds["variant_AN"],
-    #     [-2, -2, -2, -2, -2, -2, 6, -2, -2],
-    # )
     assert_array_equal(
         ds["variant_AN"],
         [-1, -1, -1, -1, -1, -1, 6, -1, -1],
@@ -277,19 +270,19 @@ def test_vcf_to_zarr__small_vcf(
         [[58, 50], [65, 3], [-1, -1]],
         [[23, 27], [18, 2], [-1, -1]],
         [[56, 60], [51, 51], [-1, -1]],
-        [[-2, -2], [-2, -2], [-2, -2]],
-        [[-2, -2], [-2, -2], [-2, -2]],
-        [[-2, -2], [-2, -2], [-2, -2]],
+        [[-1, -1], [-1, -1], [-1, -1]],
+        [[-1, -1], [-1, -1], [-1, -1]],
+        [[-1, -1], [-1, -1], [-1, -1]],
     ]
 
-    # print(np.array2string(ds["call_genotype_mask"].values, separator=","))
+    # print(np.array2string(ds["call_HQ"].values, separator=","))
+    # print(np.array2string(ds["call_genotype"].values < 0, separator=","))
 
     assert_array_equal(ds["call_genotype"], call_genotype)
-    print("FIXME")
-    # assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
+    assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
     assert_array_equal(ds["call_genotype_phased"], call_genotype_phased)
-    # assert_array_equal(ds["call_DP"], call_DP)
-    # assert_array_equal(ds["call_HQ"], call_HQ)
+    assert_array_equal(ds["call_DP"], call_DP)
+    assert_array_equal(ds["call_HQ"], call_HQ)
 
     for name in ["call_genotype", "call_genotype_mask", "call_HQ"]:
         assert ds[name].chunks == ((5, 4), (2, 1), (2,))