Skip to content

Commit

Permalink
x64: Improve lowerings for ctz/clz instructions (#8673)
Browse files Browse the repository at this point in the history
* x64: Add some more tests for `ctz`/`clz`

* x64: Improve lowerings for i8/i16/i128 `ctz` and `clz` instructions
  • Loading branch information
afonso360 authored May 21, 2024
1 parent 55909a3 commit 3eae74d
Show file tree
Hide file tree
Showing 9 changed files with 599 additions and 26 deletions.
52 changes: 27 additions & 25 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -2129,21 +2129,14 @@

;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; If available, we can use a plain lzcnt instruction here. Note no
;; special handling is required for zero inputs, because the machine
;; instruction does what the CLIF expects for zero, i.e. it returns
;; the operand width in bits (32 or 64).
(rule 3 (lower (has_type (ty_32_or_64 ty) (clz src)))
(if-let $true (use_lzcnt))
(x64_lzcnt ty src))

;; 32- and 64-bit `clz`: hand the operand to `do_clz`, passing `ty` as both
;; the operating type and the original type.
(rule 2 (lower (has_type (ty_32_or_64 ty) (clz src)))
(do_clz ty ty src))

;; 8- and 16-bit `clz` using 32-bit units: zero-extend the operand to 32
;; bits and call `do_clz` with `$I32` as the operating type and `ty` as the
;; original type.
(rule 1 (lower
(has_type (ty_8_or_16 ty)
(clz src)))
(do_clz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero))))
;; 8- and 16-bit `clz`: zero-extend the operand to 64 bits, count leading
;; zeros at 64-bit width via `do_clz`, then subtract the `64 - bits(ty)`
;; extra leading zeros introduced by the extension (48 for i16, 56 for i8).
(rule 1 (lower (has_type (ty_8_or_16 ty) (clz src)))
(let ((extended Gpr (extend_to_gpr src $I64 (ExtendKind.Zero)))
(clz Gpr (do_clz $I64 $I64 extended)))
(x64_sub $I64 clz (RegMemImm.Imm (u32_sub 64 (ty_bits ty))))))


(rule 0 (lower
(has_type $I128
Expand All @@ -2160,27 +2153,29 @@

;; Implementation helper for clz; operates on 32 or 64-bit units.
;; Arguments, as bound by the rules for this term: the type to operate at
;; (`ty`), the original type of the value (`orig_ty`), and the source
;; register (`src`).
(decl do_clz (Type Type Gpr) Gpr)
(rule (do_clz ty orig_ty src)

;; If available, we can use a plain lzcnt instruction here. Note no
;; special handling is required for zero inputs, because the machine
;; instruction does what the CLIF expects for zero, i.e. it returns
;; the operand width in bits.
(rule 1 (do_clz ty orig_ty src)
(if-let $true (use_lzcnt))
(x64_lzcnt ty src))

;; Lower-priority fallback for when the lzcnt rule does not apply: take the
;; index of the highest set bit from `bsr_or_else` (which is given -1 as its
;; alternative value, presumably used for a zero input) and subtract it from
;; `bits(orig_ty) - 1`.
(rule 0 (do_clz ty orig_ty src)
(let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1)))
(bits_minus_1 Reg (imm ty (u64_sub (ty_bits_u64 orig_ty) 1))))
(x64_sub ty bits_minus_1 highest_bit_index)))

;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Analogous to `clz` cases above, but using mirror instructions
;; (tzcnt vs lzcnt, bsf vs bsr).

;; 32- and 64-bit `ctz` when `use_bmi1` holds: emit tzcnt directly at
;; type `ty`.
(rule 3 (lower (has_type (ty_32_or_64 ty) (ctz src)))
(if-let $true (use_bmi1))
(x64_tzcnt ty src))

;; 32- and 64-bit `ctz`: hand the operand to `do_ctz`, passing `ty` as both
;; the operating type and the original type.
(rule 2 (lower (has_type (ty_32_or_64 ty) (ctz src)))
(do_ctz ty ty src))

;; 8- and 16-bit `ctz` using 32-bit units: zero-extend the operand to 32
;; bits and call `do_ctz` with `$I32` as the operating type and `ty` as the
;; original type.
(rule 1 (lower
(has_type (ty_8_or_16 ty)
(ctz src)))
(do_ctz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero))))
;; 8- and 16-bit `ctz`: zero-extend the operand to 32 bits and OR in a
;; "stop bit" at position `bits(ty)` (i.e. `1 << bits(ty)`), so that the
;; 32-bit count can never exceed `bits(ty)`, even when the operand is zero.
(rule 1 (lower (has_type (ty_8_or_16 ty) (ctz src)))
(let ((extended Gpr (extend_to_gpr src $I32 (ExtendKind.Zero)))
(stopbit Gpr (x64_or $I32 extended (RegMemImm.Imm (u32_shl 1 (ty_bits ty))))))
(do_ctz $I32 ty stopbit)))

(rule 0 (lower
(has_type $I128
Expand All @@ -2196,7 +2191,14 @@
(value_regs result_lo (imm $I64 0))))

;; Implementation helper for ctz. Arguments, as bound by the rules for this
;; term: the type to operate at (`ty`), the original type of the value
;; (`orig_ty`), and the source register (`src`).
(decl do_ctz (Type Type Gpr) Gpr)
(rule (do_ctz ty orig_ty src)

;; Analogous to `clz` cases above, but using mirror instructions
;; (tzcnt vs lzcnt, bsf vs bsr).
;; Preferred rule: when `use_bmi1` holds, emit tzcnt at type `ty`.
(rule 1 (do_ctz ty orig_ty src)
(if-let $true (use_bmi1))
(x64_tzcnt ty src))

;; Lower-priority fallback for when the tzcnt rule does not apply: use
;; `bsf_or_else`, giving it `bits(orig_ty)` as the alternative value
;; (presumably the result produced for a zero input).
(rule 0 (do_ctz ty orig_ty src)
(bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty))))

;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Expand Down
5 changes: 5 additions & 0 deletions cranelift/codegen/src/isle_prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,11 @@ macro_rules! isle_common_prelude_methods {
a & b
}

/// Returns `x` shifted left by `y` bits.
///
/// This is Rust's plain `<<` on `u32`, so `y` is expected to be less than 32.
#[inline]
fn u32_shl(&mut self, x: u32, y: u32) -> u32 {
x << y
}

#[inline]
fn s32_add_fallible(&mut self, a: i32, b: i32) -> Option<i32> {
a.checked_add(b)
Expand Down
3 changes: 3 additions & 0 deletions cranelift/codegen/src/prelude.isle
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@
(decl pure u32_and (u32 u32) u32)
(extern constructor u32_and u32_and)

;; Pure constructor that shifts the first `u32` left by the second
;; (`x << y`); implemented externally as `u32_shl`.
(decl pure u32_shl (u32 u32) u32)
(extern constructor u32_shl u32_shl)

;; Pure/fallible constructor that tries to add two `u32`s, interpreted
;; as signed values, and fails to match on overflow.
(decl pure partial s32_add_fallible (i32 i32) i32)
Expand Down
95 changes: 95 additions & 0 deletions cranelift/filetests/filetests/isa/x64/clz-lzcnt.clif
Original file line number Diff line number Diff line change
@@ -1,6 +1,43 @@
test compile precise-output
set enable_llvm_abi_extensions=true
target x86_64 has_lzcnt


function %clz(i128) -> i128 {
block0(v0: i128):
v1 = clz v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; lzcntq %rsi, %rcx
; lzcntq %rdi, %rax
; addq %rax, $64, %rax
; cmpq $64, %rcx
; cmovnzq %rcx, %rax, %rax
; xorq %rdx, %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; lzcntq %rsi, %rcx
; lzcntq %rdi, %rax
; addq $0x40, %rax
; cmpq $0x40, %rcx
; cmovneq %rcx, %rax
; xorq %rdx, %rdx
; movq %rbp, %rsp
; popq %rbp
; retq

function %clz(i64) -> i64 {
block0(v0: i64):
v1 = clz v0
Expand Down Expand Up @@ -51,3 +88,61 @@ block0(v0: i32):
; popq %rbp
; retq

function %clz(i16) -> i16 {
block0(v0: i16):
v1 = clz v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzwq %di, %rax
; lzcntq %rax, %rax
; subq %rax, $48, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzwq %di, %rax
; lzcntq %rax, %rax
; subq $0x30, %rax
; movq %rbp, %rsp
; popq %rbp
; retq

function %clz(i8) -> i8 {
block0(v0: i8):
v1 = clz v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movzbq %dil, %rax
; lzcntq %rax, %rax
; subq %rax, $56, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movzbq %dil, %rax
; lzcntq %rax, %rax
; subq $0x38, %rax
; movq %rbp, %rsp
; popq %rbp
; retq

Loading

0 comments on commit 3eae74d

Please sign in to comment.