Skip to content

Commit

Permalink
Improve atomic_rmw lowering on x86 (#9495)
Browse files Browse the repository at this point in the history
  • Loading branch information
beetrees authored Oct 23, 2024
1 parent d19610e commit f6a9612
Show file tree
Hide file tree
Showing 13 changed files with 3,965 additions and 612 deletions.
123 changes: 104 additions & 19 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
(AluRM (size OperandSize) ;; 1, 2, 4 or 8
(op AluRmiROpcode)
(src1_dst SyntheticAmode)
(src2 Gpr))
(src2 Gpr)
(lock bool))

;; Integer arithmetic binary op that relies on the VEX prefix.
;; NOTE: we don't currently support emitting VEX instructions with memory
Expand Down Expand Up @@ -682,6 +683,18 @@
(dst_old_low WritableReg)
(dst_old_high WritableReg))

;; A standard (native) `lock xadd src, (amode)`
(LockXadd (size OperandSize)
(operand Reg)
(mem SyntheticAmode)
(dst_old WritableReg))

;; A standard (native) `xchg src, (amode)`
(Xchg (size OperandSize)
(operand Reg)
(mem SyntheticAmode)
(dst_old WritableReg))

;; A synthetic instruction, based on a loop around a native `lock
;; cmpxchg` instruction.
;;
Expand All @@ -708,7 +721,7 @@
;; - %rflags is written. Do not assume anything about it after the
;; instruction.
(AtomicRmwSeq (ty Type) ;; I8, I16, I32, or I64
(op MachAtomicRmwOp)
(op AtomicRmwSeqOp)
(mem SyntheticAmode)
(operand Reg)
(temp WritableReg)
Expand All @@ -719,15 +732,15 @@
;;
;; This is the same as `AtomicRmwSeq`, but for 128-bit integers.
;;
;; For `MachAtomicRmwOp::Xchg`, use `Atomic128XchgSeq` instead.
;; For `AtomicRmwOp::Xchg`, use `Atomic128XchgSeq` instead.
;;
;; This instruction sequence has fixed register uses as follows:
;; - %rax (low), %rdx (high) (written) the old value at `mem`
;; - %rbx (low), %rcx (high) (written) used as temp registers to hold
;; the replacement value
;; - %rflags is written. Do not assume anything about it after the
;; instruction.
(Atomic128RmwSeq (op MachAtomicRmwOp)
(Atomic128RmwSeq (op Atomic128RmwSeqOp)
(mem BoxSyntheticAmode)
(operand_low Reg)
(operand_high Reg)
Expand All @@ -739,8 +752,8 @@
;; A synthetic instruction, based on a loop around a native `lock
;; cmpxchg16b` instruction.
;;
;; This is `Atomic128RmwSeq` but only for `MachAtomicRmwOp::Xchg`. As
;; the replacement value is the same every time, this instruction doesn't
;; This is `Atomic128RmwSeq` but only for `AtomicRmwOp::Xchg`. As the
;; replacement value is the same every time, this instruction doesn't
;; require any temporary registers.
;;
;; This instruction sequence has fixed register uses as follows:
Expand Down Expand Up @@ -4899,7 +4912,7 @@
(decl alu_rm (Type AluRmiROpcode Amode Gpr) SideEffectNoResult)
(rule (alu_rm ty opcode src1_dst src2)
(let ((size OperandSize (operand_size_of_type_32_64 ty)))
(SideEffectNoResult.Inst (MInst.AluRM size opcode src1_dst src2))))
(SideEffectNoResult.Inst (MInst.AluRM size opcode src1_dst src2 $false))))

(decl x64_add_mem (Type Amode Gpr) SideEffectNoResult)
(spec (x64_add_mem ty addr val)
Expand Down Expand Up @@ -5291,25 +5304,62 @@
(_ Unit (emit (MInst.LockCmpxchg16b replacement_low replacement_high expected_low expected_high addr dst_low dst_high))))
(value_regs dst_low dst_high)))

(decl x64_atomic_rmw_seq (Type MachAtomicRmwOp SyntheticAmode Gpr) Gpr)
;; Helper for emitting `MInst.LockXadd` (a native `lock xadd operand, (addr)`).
;;
;; Allocates a fresh temporary GPR, passes it as the instruction's `dst_old`
;; field, and returns it. Note that the helper takes `(size addr operand)` while
;; the `MInst.LockXadd` constructor takes `(size operand mem dst_old)`.
;; NOTE(review): going by the field name, the result presumably holds the value
;; that was in memory before the add -- confirm against the emission code.
(decl x64_xadd (OperandSize SyntheticAmode Gpr) Gpr)
(rule (x64_xadd size addr operand)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.LockXadd size operand addr dst))))
dst))

;; Helper for emitting `MInst.Xchg` (a native `xchg operand, (addr)`).
;;
;; Allocates a fresh temporary GPR, passes it as the instruction's `dst_old`
;; field, and returns it. Note that the helper takes `(size addr operand)` while
;; the `MInst.Xchg` constructor takes `(size operand mem dst_old)`.
;; NOTE(review): going by the field name, the result presumably holds the value
;; that was in memory before the exchange -- confirm against the emission code.
(decl x64_xchg (OperandSize SyntheticAmode Gpr) Gpr)
(rule (x64_xchg size addr operand)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.Xchg size operand addr dst))))
dst))

;; Helper for emitting `MInst.AluRM` with its `lock` flag set to `$true`, i.e.
;; a read-modify-write ALU operation of `operand` into the memory at `addr`.
;;
;; Unlike `alu_rm`, which builds the instruction with `$false` and wraps it in
;; `SideEffectNoResult`, this emits the instruction directly. No result
;; register is allocated; the helper returns `(invalid_reg)`.
;; NOTE(review): presumably intended for atomic RMW lowering when the old
;; memory value is not needed -- confirm against the callers.
(decl lock_alu_rm (OperandSize AluRmiROpcode SyntheticAmode Gpr) Reg)
(rule (lock_alu_rm size opcode addr operand)
(let ((_ Unit (emit (MInst.AluRM size opcode addr operand $true))))
(invalid_reg)))

;; Helper for emitting a locked `add` of `operand` into the memory at `addr`.
;; Forwards to `lock_alu_rm` with `AluRmiROpcode.Add` and returns its result.
(decl x64_lock_add (OperandSize SyntheticAmode Gpr) Reg)
(rule (x64_lock_add size addr operand)
(lock_alu_rm size (AluRmiROpcode.Add) addr operand))

;; Helper for emitting a locked `sub` of `operand` from the memory at `addr`.
;; Forwards to `lock_alu_rm` with `AluRmiROpcode.Sub` and returns its result.
(decl x64_lock_sub (OperandSize SyntheticAmode Gpr) Reg)
(rule (x64_lock_sub size addr operand)
(lock_alu_rm size (AluRmiROpcode.Sub) addr operand))

;; Helper for emitting a locked `and` of `operand` into the memory at `addr`.
;; Forwards to `lock_alu_rm` with `AluRmiROpcode.And` and returns its result.
(decl x64_lock_and (OperandSize SyntheticAmode Gpr) Reg)
(rule (x64_lock_and size addr operand)
(lock_alu_rm size (AluRmiROpcode.And) addr operand))

;; Helper for emitting a locked `or` of `operand` into the memory at `addr`.
;; Forwards to `lock_alu_rm` with `AluRmiROpcode.Or` and returns its result.
(decl x64_lock_or (OperandSize SyntheticAmode Gpr) Reg)
(rule (x64_lock_or size addr operand)
(lock_alu_rm size (AluRmiROpcode.Or) addr operand))

;; Helper for emitting a locked `xor` of `operand` into the memory at `addr`.
;; Forwards to `lock_alu_rm` with `AluRmiROpcode.Xor` and returns its result.
(decl x64_lock_xor (OperandSize SyntheticAmode Gpr) Reg)
(rule (x64_lock_xor size addr operand)
(lock_alu_rm size (AluRmiROpcode.Xor) addr operand))

(decl x64_atomic_rmw_seq (Type AtomicRmwSeqOp SyntheticAmode Gpr) Gpr)
(rule (x64_atomic_rmw_seq ty op mem input)
(let ((dst WritableGpr (temp_writable_gpr))
(tmp WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.AtomicRmwSeq ty op mem input tmp dst))))
dst))

(decl x64_atomic_128_rmw_seq (MachAtomicRmwOp SyntheticAmode ValueRegs) ValueRegs)
(decl x64_atomic_128_rmw_seq (AtomicRmwOp SyntheticAmode ValueRegs) ValueRegs)
(rule (x64_atomic_128_rmw_seq op mem input)
(let ((dst_low WritableGpr (temp_writable_gpr))
(dst_high WritableGpr (temp_writable_gpr))
(tmp_low WritableGpr (temp_writable_gpr))
(tmp_high WritableGpr (temp_writable_gpr))
(input_low Gpr (value_regs_get_gpr input 0))
(input_high Gpr (value_regs_get_gpr input 1))
(_ Unit (emit (MInst.Atomic128RmwSeq op mem input_low input_high tmp_low tmp_high dst_low dst_high))))
(_ Unit (emit (MInst.Atomic128RmwSeq (atomic_128_rmw_seq_op op) mem input_low input_high tmp_low tmp_high dst_low dst_high))))
(value_regs dst_low dst_high)))

(rule 1 (x64_atomic_128_rmw_seq (mach_atomic_rmw_op_xchg) mem input)
(rule 1 (x64_atomic_128_rmw_seq (AtomicRmwOp.Xchg) mem input)
(let ((dst_low WritableGpr (temp_writable_gpr))
(dst_high WritableGpr (temp_writable_gpr))
(input_low Gpr (value_regs_get_gpr input 0))
Expand All @@ -5325,14 +5375,50 @@
(input_high Gpr (value_regs_get_gpr input 1)))
(SideEffectNoResult.Inst (MInst.Atomic128XchgSeq mem input_low input_high dst_low dst_high))))

(decl mach_atomic_rmw_op_xchg () MachAtomicRmwOp)
(extern extractor mach_atomic_rmw_op_xchg mach_atomic_rmw_op_is_xchg)

;; CLIF IR has one enumeration for atomic operations (`AtomicRmwOp`) while the
;; mach backend has another (`MachAtomicRmwOp`)--this converts one to the other.
(type MachAtomicRmwOp extern (enum))
(decl atomic_rmw_op_to_mach_atomic_rmw_op (AtomicRmwOp) MachAtomicRmwOp)
(extern constructor atomic_rmw_op_to_mach_atomic_rmw_op atomic_rmw_op_to_mach_atomic_rmw_op)
;; The operations accepted by the `op` field of `MInst.AtomicRmwSeq`.
;;
;; This set has no `Add`, `Sub` or `Xchg` variant, unlike `AtomicRmwOp`.
;; NOTE(review): presumably those are lowered to native instructions instead
;; of the `lock cmpxchg` loop -- confirm against the lowering rules.
(type AtomicRmwSeqOp
(enum And
Nand
Or
Xor
Umin
Umax
Smin
Smax))

;; Convert a CLIF `AtomicRmwOp` into the matching `AtomicRmwSeqOp`.
;;
;; Each rule maps a variant to the variant of the same name. There are no
;; rules here for `AtomicRmwOp.Add`, `AtomicRmwOp.Sub` or `AtomicRmwOp.Xchg`.
;; NOTE(review): callers are presumably expected never to pass those
;; operations -- confirm against the lowering rules.
(decl atomic_rmw_seq_op (AtomicRmwOp) AtomicRmwSeqOp)
(rule (atomic_rmw_seq_op (AtomicRmwOp.And)) (AtomicRmwSeqOp.And))
(rule (atomic_rmw_seq_op (AtomicRmwOp.Nand)) (AtomicRmwSeqOp.Nand))
(rule (atomic_rmw_seq_op (AtomicRmwOp.Or)) (AtomicRmwSeqOp.Or))
(rule (atomic_rmw_seq_op (AtomicRmwOp.Xor)) (AtomicRmwSeqOp.Xor))
(rule (atomic_rmw_seq_op (AtomicRmwOp.Umin)) (AtomicRmwSeqOp.Umin))
(rule (atomic_rmw_seq_op (AtomicRmwOp.Umax)) (AtomicRmwSeqOp.Umax))
(rule (atomic_rmw_seq_op (AtomicRmwOp.Smin)) (AtomicRmwSeqOp.Smin))
(rule (atomic_rmw_seq_op (AtomicRmwOp.Smax)) (AtomicRmwSeqOp.Smax))

;; The operations accepted by the `op` field of `MInst.Atomic128RmwSeq`.
;;
;; There is deliberately no `Xchg` variant: 128-bit exchanges use
;; `MInst.Atomic128XchgSeq` instead.
(type Atomic128RmwSeqOp
(enum Add
Sub
And
Nand
Or
Xor
Umin
Umax
Smin
Smax))

;; Convert a CLIF `AtomicRmwOp` into the matching `Atomic128RmwSeqOp`.
;;
;; Each rule maps a variant to the variant of the same name. There is no rule
;; for `AtomicRmwOp.Xchg`: `x64_atomic_128_rmw_seq` has a higher-priority rule
;; that handles `Xchg` without going through this conversion.
(decl atomic_128_rmw_seq_op (AtomicRmwOp) Atomic128RmwSeqOp)
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Add)) (Atomic128RmwSeqOp.Add))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Sub)) (Atomic128RmwSeqOp.Sub))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.And)) (Atomic128RmwSeqOp.And))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Nand)) (Atomic128RmwSeqOp.Nand))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Or)) (Atomic128RmwSeqOp.Or))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Xor)) (Atomic128RmwSeqOp.Xor))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Umin)) (Atomic128RmwSeqOp.Umin))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Umax)) (Atomic128RmwSeqOp.Umax))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Smin)) (Atomic128RmwSeqOp.Smin))
(rule (atomic_128_rmw_seq_op (AtomicRmwOp.Smax)) (Atomic128RmwSeqOp.Smax))

;;;; Casting ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Expand Down Expand Up @@ -5559,7 +5645,6 @@
(convert VCodeConstant RegMem const_to_reg_mem)

(convert IntCC CC intcc_to_cc)
(convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)

(convert SinkableLoad RegMem sink_load_to_reg_mem)
(convert SinkableLoad GprMem sink_load_to_gpr_mem)
Expand Down
89 changes: 52 additions & 37 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use crate::isa::x64::encoding::rex::{
use crate::isa::x64::encoding::vex::{VexInstruction, VexVectorLength};
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::x64::lower::isle::generated_code::{Atomic128RmwSeqOp, AtomicRmwSeqOp};

/// A small helper to generate a signed conversion instruction.
fn emit_signed_cvt(
Expand Down Expand Up @@ -255,6 +256,7 @@ pub(crate) fn emit(
src1_dst,
src2,
op,
lock,
} => {
let src2 = src2.to_reg();
let src1_dst = src1_dst.finalize(state, sink).clone();
Expand All @@ -268,10 +270,11 @@ pub(crate) fn emit(
_ => panic!("Unsupported read-modify-write ALU opcode"),
};

let prefix = if *size == OperandSize::Size16 {
LegacyPrefixes::_66
} else {
LegacyPrefixes::None
let prefix = match (size, lock) {
(OperandSize::Size16, false) => LegacyPrefixes::_66,
(OperandSize::Size16, true) => LegacyPrefixes::_66F0,
(_, false) => LegacyPrefixes::None,
(_, true) => LegacyPrefixes::_F0,
};
let opcode = if *size == OperandSize::Size8 {
opcode - 1
Expand Down Expand Up @@ -4070,6 +4073,45 @@ pub(crate) fn emit(
);
}

// Emit a native `lock xadd` of `operand` into the memory at `mem`.
Inst::LockXadd {
size,
operand,
mem,
dst_old,
} => {
// Only one register is encoded below (`operand`), so it serves as both
// the source and the destination; `dst_old` must therefore have been
// assigned the very same register.
debug_assert_eq!(dst_old.to_reg(), *operand);
// lock xadd{b,w,l,q} %operand, (mem)
// Note that 0xF0 is the Lock prefix.
// The 8-bit form uses opcode 0x0FC0; every wider form uses 0x0FC1, and
// the 16-bit form additionally carries the 0x66 prefix.
let (prefix, opcodes) = match size {
OperandSize::Size8 => (LegacyPrefixes::_F0, 0x0FC0),
OperandSize::Size16 => (LegacyPrefixes::_66F0, 0x0FC1),
OperandSize::Size32 => (LegacyPrefixes::_F0, 0x0FC1),
OperandSize::Size64 => (LegacyPrefixes::_F0, 0x0FC1),
};
let rex = RexFlags::from((*size, *operand));
let amode = mem.finalize(state, sink);
// NOTE(review): the literal `2` is presumably the number of opcode
// bytes (0x0F followed by 0xC0/0xC1) -- confirm against
// `emit_std_reg_mem`.
emit_std_reg_mem(sink, prefix, opcodes, 2, *operand, &amode, rex, 0);
}

// Emit a native `xchg` between `operand` and the memory at `mem`.
Inst::Xchg {
size,
operand,
mem,
dst_old,
} => {
// Only one register is encoded below (`operand`), so it serves as both
// the source and the destination; `dst_old` must therefore have been
// assigned the very same register.
debug_assert_eq!(dst_old.to_reg(), *operand);
// xchg{b,w,l,q} %operand, (mem)
// No 0xF0 lock prefix is emitted here, unlike `LockXadd`.
// NOTE(review): x86 `xchg` with a memory operand is documented as
// implicitly locked -- confirm against the Intel SDM.
// The 8-bit form uses opcode 0x86; every wider form uses 0x87, and the
// 16-bit form additionally carries the 0x66 prefix.
let (prefix, opcodes) = match size {
OperandSize::Size8 => (LegacyPrefixes::None, 0x86),
OperandSize::Size16 => (LegacyPrefixes::_66, 0x87),
OperandSize::Size32 => (LegacyPrefixes::None, 0x87),
OperandSize::Size64 => (LegacyPrefixes::None, 0x87),
};
let rex = RexFlags::from((*size, *operand));
let amode = mem.finalize(state, sink);
// NOTE(review): the literal `1` is presumably the number of opcode
// bytes -- confirm against `emit_std_reg_mem`.
emit_std_reg_mem(sink, prefix, opcodes, 1, *operand, &amode, rex, 0);
}

Inst::AtomicRmwSeq {
ty,
op,
Expand All @@ -4094,15 +4136,6 @@ pub(crate) fn emit(
//
// Operand conventions: IN: %r_address, %r_operand OUT: %rax (old
// value), %r_temp (trashed), %rflags (trashed)
//
// In the case where the operation is 'xchg', the "`op`q"
// instruction is instead: movq %r_operand,
// %r_temp so that we simply write in the destination, the "2nd
// arg for `op`".
//
// TODO: this sequence can be significantly improved (e.g., to `lock
// <op>`) when it is known that `dst_old` is not used later, see
// https://github.com/bytecodealliance/wasmtime/issues/2153.
let again_label = sink.get_label();

// mov{zbq,zwq,zlq,q} (%r_address), %rax
Expand All @@ -4118,13 +4151,8 @@ pub(crate) fn emit(
i2.emit(sink, info, state);

let operand_rmi = RegMemImm::reg(operand);
use inst_common::MachAtomicRmwOp as RmwOp;
use AtomicRmwSeqOp as RmwOp;
match op {
RmwOp::Xchg => {
// movq %r_operand, %r_temp
let i3 = Inst::mov_r_r(OperandSize::Size64, operand, temp);
i3.emit(sink, info, state);
}
RmwOp::Nand => {
// andq %r_operand, %r_temp
let i3 =
Expand Down Expand Up @@ -4155,20 +4183,13 @@ pub(crate) fn emit(
let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(operand), temp);
i4.emit(sink, info, state);
}
_ => {
RmwOp::And | RmwOp::Or | RmwOp::Xor => {
// opq %r_operand, %r_temp
let alu_op = match op {
RmwOp::Add => AluRmiROpcode::Add,
RmwOp::Sub => AluRmiROpcode::Sub,
RmwOp::And => AluRmiROpcode::And,
RmwOp::Or => AluRmiROpcode::Or,
RmwOp::Xor => AluRmiROpcode::Xor,
RmwOp::Xchg
| RmwOp::Nand
| RmwOp::Umin
| RmwOp::Umax
| RmwOp::Smin
| RmwOp::Smax => unreachable!(),
_ => unreachable!(),
};
let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, operand_rmi, temp);
i3.emit(sink, info, state);
Expand Down Expand Up @@ -4232,9 +4253,8 @@ pub(crate) fn emit(
// Perform the operation.
let operand_low_rmi = RegMemImm::reg(operand_low);
let operand_high_rmi = RegMemImm::reg(operand_high);
use inst_common::MachAtomicRmwOp as RmwOp;
use Atomic128RmwSeqOp as RmwOp;
match op {
RmwOp::Xchg => panic!("use `Atomic128XchgSeq` instead"),
RmwOp::Nand => {
// temp &= operand
Inst::alu_rmi_r(
Expand Down Expand Up @@ -4284,20 +4304,15 @@ pub(crate) fn emit(
Inst::cmove(OperandSize::Size64, cc, operand_high.into(), temp_high)
.emit(sink, info, state);
}
_ => {
RmwOp::Add | RmwOp::Sub | RmwOp::And | RmwOp::Or | RmwOp::Xor => {
// temp op= operand
let (op_low, op_high) = match op {
RmwOp::Add => (AluRmiROpcode::Add, AluRmiROpcode::Adc),
RmwOp::Sub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb),
RmwOp::And => (AluRmiROpcode::And, AluRmiROpcode::And),
RmwOp::Or => (AluRmiROpcode::Or, AluRmiROpcode::Or),
RmwOp::Xor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor),
RmwOp::Xchg
| RmwOp::Nand
| RmwOp::Umin
| RmwOp::Umax
| RmwOp::Smin
| RmwOp::Smax => unreachable!(),
_ => unreachable!(),
};
Inst::alu_rmi_r(OperandSize::Size64, op_low, operand_low_rmi, temp_low)
.emit(sink, info, state);
Expand Down
Loading

0 comments on commit f6a9612

Please sign in to comment.