Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial f16 and f128 support to the aarch64 backend #9076

Merged
merged 1 commit into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cranelift/codegen/meta/src/isa/arm64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ pub(crate) fn define() -> TargetIsa {
"",
false,
);
settings.add_bool(
"has_fp16",
"Use half-precision floating point (FEAT_FP16) instructions.",
"",
false,
);
settings.add_bool(
"sign_return_address_all",
"If function return address signing is enabled, then apply it to all \
Expand Down
9 changes: 8 additions & 1 deletion cranelift/codegen/src/isa/aarch64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ impl ABIMachineSpec for AArch64MachineDeps {

fn compute_arg_locs(
call_conv: isa::CallConv,
_flags: &settings::Flags,
flags: &settings::Flags,
params: &[ir::AbiParam],
args_or_rets: ArgsOrRets,
add_ret_area_ptr: bool,
Expand Down Expand Up @@ -161,6 +161,13 @@ impl ABIMachineSpec for AArch64MachineDeps {
param.value_type
);

if is_apple_cc && param.value_type == types::F128 && !flags.enable_llvm_abi_extensions()
{
panic!(
"f128 args/return values not supported for apple_aarch64 unless LLVM ABI extensions are enabled"
);
}

let (rcs, reg_types) = Inst::rc_for_type(param.value_type)?;

if matches!(
Expand Down
80 changes: 74 additions & 6 deletions cranelift/codegen/src/isa/aarch64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,18 @@
(rn Reg)
(rm Reg))

;; Floating-point load, half-precision (16 bit).
(FpuLoad16
(rd WritableReg)
(mem AMode)
(flags MemFlags))

;; Floating-point store, half-precision (16 bit).
(FpuStore16
(rd Reg)
(mem AMode)
(flags MemFlags))

;; Floating-point load, single-precision (32 bit).
(FpuLoad32
(rd WritableReg)
Expand Down Expand Up @@ -483,6 +495,14 @@
(rd WritableReg)
(rn Reg))

;; FP conditional select, 16 bit.
;; Requires FEAT_FP16.
(FpuCSel16
(rd WritableReg)
(rn Reg)
(rm Reg)
(cond Cond))

;; FP conditional select, 32 bit.
(FpuCSel32
(rd WritableReg)
Expand All @@ -504,8 +524,8 @@
(rn Reg))

;; Move from a GPR to a vector register. The scalar value is parked in the lowest lane
;; of the destination, and all other lanes are zeroed out. Currently only 32- and 64-bit
;; transactions are supported.
;; of the destination, and all other lanes are zeroed out. Currently 16-, 32- and 64-bit
;; transactions are supported. 16-bit moves require FEAT_FP16.
(MovToFpu
(rd WritableReg)
(rn Reg)
Expand Down Expand Up @@ -1701,6 +1721,9 @@
(decl use_lse () Inst)
(extern extractor use_lse use_lse)

;; Pure, externally implemented predicate returning a `bool`. Rules in this
;; file guard on it with `(if-let $true (use_fp16))` / `(if-let $false (use_fp16))`
;; to choose between native 16-bit FP instructions and a 32-bit fallback.
;; NOTE(review): presumably mirrors the `has_fp16` (FEAT_FP16) ISA setting --
;; confirm against the Rust-side implementation.
(decl pure use_fp16 () bool)
(extern constructor use_fp16 use_fp16)

;; Extractor helpers for various immediate constants ;;;;;;;;;;;;;;;;;;;;;;;;;;

(decl pure partial move_wide_const_from_u64 (Type u64) MoveWideConst)
Expand Down Expand Up @@ -2221,9 +2244,19 @@
(_ Unit (emit (MInst.VecRRLong op dst src high_half))))
dst))

;; Helper for emitting `MInst.FpuCSel32` / `MInst.FpuCSel64`
;; Helper for emitting `MInst.FpuCSel16` / `MInst.FpuCSel32` / `MInst.FpuCSel64`
;; instructions.
(decl fpu_csel (Type Cond Reg Reg) ConsumesFlags)
(rule (fpu_csel $F16 cond if_true if_false)
(fpu_csel $F32 cond if_true if_false))

(rule 1 (fpu_csel $F16 cond if_true if_false)
(if-let $true (use_fp16))
(let ((dst WritableReg (temp_writable_reg $F16)))
(ConsumesFlags.ConsumesFlagsReturnsReg
(MInst.FpuCSel16 dst if_true if_false cond)
dst)))

(rule (fpu_csel $F32 cond if_true if_false)
(let ((dst WritableReg (temp_writable_reg $F32)))
(ConsumesFlags.ConsumesFlagsReturnsReg
Expand Down Expand Up @@ -2268,6 +2301,9 @@
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.MovToFpu dst x size))))
dst))
(rule 1 (mov_to_fpu x (ScalarSize.Size16))
(if-let $false (use_fp16))
(mov_to_fpu x (ScalarSize.Size32)))

;; Helper for emitting `MInst.FpuMoveFPImm` instructions.
(decl fpu_move_fp_imm (ASIMDFPModImm ScalarSize) Reg)
Expand Down Expand Up @@ -2849,6 +2885,11 @@
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.ULoad64 dst amode flags))))
dst))
;; Helper for emitting `MInst.FpuLoad16` instructions (half-precision FP load).
;;
;; Allocates a fresh temporary register, emits a load from `amode` into it
;; with the given memory `flags`, and returns that register. Note that the
;; temporary is requested with type `$F64` rather than `$F16`.
(decl aarch64_fpuload16 (AMode MemFlags) Reg)
(rule (aarch64_fpuload16 amode flags)
(let ((dst WritableReg (temp_writable_reg $F64))
(_ Unit (emit (MInst.FpuLoad16 dst amode flags))))
dst))
(decl aarch64_fpuload32 (AMode MemFlags) Reg)
(rule (aarch64_fpuload32 amode flags)
(let ((dst WritableReg (temp_writable_reg $F64))
Expand Down Expand Up @@ -2885,6 +2926,9 @@
(decl aarch64_store64 (AMode MemFlags Reg) SideEffectNoResult)
(rule (aarch64_store64 amode flags val)
(SideEffectNoResult.Inst (MInst.Store64 val amode flags)))
;; Helper for building an `MInst.FpuStore16` (half-precision FP store) of
;; `val` to `amode` with memory `flags`. The instruction is not emitted here;
;; it is returned wrapped in `SideEffectNoResult.Inst`.
(decl aarch64_fpustore16 (AMode MemFlags Reg) SideEffectNoResult)
(rule (aarch64_fpustore16 amode flags val)
(SideEffectNoResult.Inst (MInst.FpuStore16 val amode flags)))
(decl aarch64_fpustore32 (AMode MemFlags Reg) SideEffectNoResult)
(rule (aarch64_fpustore32 amode flags val)
(SideEffectNoResult.Inst (MInst.FpuStore32 val amode flags)))
Expand Down Expand Up @@ -3229,19 +3273,41 @@
(rule 1 (add_imm_to_addr val (imm12_from_u64 imm)) (add_imm $I64 val imm))
(rule 0 (add_imm_to_addr val offset) (add $I64 val (imm $I64 (ImmExtend.Zero) offset)))

;; Lower a constant f16.
;;
;; Note that we must make sure that all bits outside the lowest 16 are set to 0
;; because this function is also used to load wider constants (that have zeros
;; in their most significant bits).
;;
;; The rules below are listed from the highest priority number to the lowest;
;; a higher-priority rule takes precedence when several match.
(decl constant_f16 (u16) Reg)
;; Priority 3: when `use_fp16` is false, defer to `constant_f32` with the
;; same bits.
(rule 3 (constant_f16 n)
(if-let $false (use_fp16))
(constant_f32 n))
;; Priority 2: the all-zero bit pattern is built by `vec_dup_imm` from
;; `asimd_mov_mod_imm_zero`, using 32-bit lanes (`VectorSize.Size32x2`).
(rule 2 (constant_f16 0)
(vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
$false
(VectorSize.Size32x2)))
;; Priority 1: if `asimd_fp_mod_imm_from_u64` accepts `n` for
;; `ScalarSize.Size16`, use the resulting immediate with `fpu_move_fp_imm`.
(rule 1 (constant_f16 n)
(if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size16)))
(fpu_move_fp_imm imm (ScalarSize.Size16)))
;; Default priority: materialize `n` as a zero-extended `$I16` immediate and
;; move it into an FPU register with a 16-bit `mov_to_fpu`.
(rule (constant_f16 n)
(mov_to_fpu (imm $I16 (ImmExtend.Zero) n) (ScalarSize.Size16)))

;; Lower a constant f32.
;;
;; Note that we must make sure that all bits outside the lowest 32 are set to 0
;; because this function is also used to load wider constants (that have zeros
;; in their most significant bits).
(decl constant_f32 (u32) Reg)
(rule 2 (constant_f32 0)
(rule 3 (constant_f32 0)
(vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
$false
(VectorSize.Size32x2)))
(rule 1 (constant_f32 n)
(rule 2 (constant_f32 n)
(if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size32)))
(fpu_move_fp_imm imm (ScalarSize.Size32)))
(rule 1 (constant_f32 (u32_as_u16 n))
(if-let $true (use_fp16))
(constant_f16 n))
(rule (constant_f32 n)
(mov_to_fpu (imm $I32 (ImmExtend.Zero) n) (ScalarSize.Size32)))

Expand Down Expand Up @@ -4063,8 +4129,10 @@

;; Helpers for generating select instruction sequences.
(decl lower_select (ProducesFlags Cond Type Value Value) ValueRegs)
(rule 2 (lower_select flags cond (ty_scalar_float ty) rn rm)
(rule 2 (lower_select flags cond (ty_scalar_float (fits_in_64 ty)) rn rm)
(with_flags flags (fpu_csel ty cond rn rm)))
(rule 4 (lower_select flags cond $F128 rn rm)
(with_flags flags (vec_csel cond rn rm)))
(rule 3 (lower_select flags cond (ty_vec128 ty) rn rm)
(with_flags flags (vec_csel cond rn rm)))
(rule (lower_select flags cond ty rn rm)
Expand Down
15 changes: 9 additions & 6 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,7 @@ impl MachInstEmit for Inst {
| &Inst::ULoad64 {
rd, ref mem, flags, ..
}
| &Inst::FpuLoad16 { rd, ref mem, flags }
| &Inst::FpuLoad32 { rd, ref mem, flags }
| &Inst::FpuLoad64 { rd, ref mem, flags }
| &Inst::FpuLoad128 { rd, ref mem, flags } => {
Expand All @@ -983,6 +984,7 @@ impl MachInstEmit for Inst {
Inst::ULoad32 { .. } => 0b1011100001,
Inst::SLoad32 { .. } => 0b1011100010,
Inst::ULoad64 { .. } => 0b1111100001,
Inst::FpuLoad16 { .. } => 0b0111110001,
Inst::FpuLoad32 { .. } => 0b1011110001,
Inst::FpuLoad64 { .. } => 0b1111110001,
Inst::FpuLoad128 { .. } => 0b0011110011,
Expand Down Expand Up @@ -1098,6 +1100,7 @@ impl MachInstEmit for Inst {
| &Inst::Store16 { rd, ref mem, flags }
| &Inst::Store32 { rd, ref mem, flags }
| &Inst::Store64 { rd, ref mem, flags }
| &Inst::FpuStore16 { rd, ref mem, flags }
| &Inst::FpuStore32 { rd, ref mem, flags }
| &Inst::FpuStore64 { rd, ref mem, flags }
| &Inst::FpuStore128 { rd, ref mem, flags } => {
Expand All @@ -1114,6 +1117,7 @@ impl MachInstEmit for Inst {
Inst::Store16 { .. } => 0b0111100000,
Inst::Store32 { .. } => 0b1011100000,
Inst::Store64 { .. } => 0b1111100000,
Inst::FpuStore16 { .. } => 0b0111110000,
Inst::FpuStore32 { .. } => 0b1011110000,
Inst::FpuStore64 { .. } => 0b1111110000,
Inst::FpuStore128 { .. } => 0b0011110010,
Expand Down Expand Up @@ -2213,6 +2217,9 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_inttofpu(top16, rd, rn));
}
&Inst::FpuCSel16 { rd, rn, rm, cond } => {
sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size16));
}
&Inst::FpuCSel32 { rd, rn, rm, cond } => {
sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size32));
}
Expand All @@ -2234,21 +2241,17 @@ impl MachInstEmit for Inst {
}
&Inst::MovToFpu { rd, rn, size } => {
let template = match size {
ScalarSize::Size16 => 0b000_11110_11_1_00_111_000000_00000_00000,
ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
_ => unreachable!(),
};
sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
}
&Inst::FpuMoveFPImm { rd, imm, size } => {
let size_code = match size {
ScalarSize::Size32 => 0b00,
ScalarSize::Size64 => 0b01,
_ => unimplemented!(),
};
sink.put4(
0b000_11110_00_1_00_000_000100_00000_00000
| size_code << 22
| size.ftype() << 22
| ((imm.enc_bits() as u32) << 13)
| machreg_to_vec(rd.to_reg()),
);
Expand Down
37 changes: 37 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6699,6 +6699,19 @@ fn test_aarch64_binemit() {
"fcmp d23, d24",
));

insns.push((
Inst::FpuLoad16 {
rd: writable_vreg(16),
mem: AMode::RegScaled {
rn: xreg(8),
rm: xreg(9),
},
flags: MemFlags::trusted(),
},
"1079697C",
"ldr h16, [x8, x9, LSL #1]",
));

insns.push((
Inst::FpuLoad32 {
rd: writable_vreg(16),
Expand Down Expand Up @@ -6774,6 +6787,19 @@ fn test_aarch64_binemit() {
"ldr q16, pc+8",
));

insns.push((
Inst::FpuStore16 {
rd: vreg(16),
mem: AMode::RegScaled {
rn: xreg(8),
rm: xreg(9),
},
flags: MemFlags::trusted(),
},
"1079297C",
"str h16, [x8, x9, LSL #1]",
));

insns.push((
Inst::FpuStore32 {
rd: vreg(16),
Expand Down Expand Up @@ -6973,6 +6999,17 @@ fn test_aarch64_binemit() {
"stp q18, q22, [sp], #304",
));

insns.push((
Inst::FpuCSel16 {
rd: writable_vreg(1),
rn: vreg(2),
rm: vreg(3),
cond: Cond::Hi,
},
"418CE31E",
"fcsel h1, h2, h3, hi",
));

insns.push((
Inst::FpuCSel32 {
rd: writable_vreg(1),
Expand Down
Loading
Loading