bytecodealliance · cfallin · Aug 16, 2024 · Aug 4, 2024
@@ -18,6 +18,12 @@ pub(crate) fn define() -> TargetIsa {
         "",
         false,
     );
+    settings.add_bool(
+        "has_fp16",
+        "Use half-precision floating point (FEAT_FP16) instructions.",
+        "",
+        false,
+    );
     settings.add_bool(
         "sign_return_address_all",
         "If function return address signing is enabled, then apply it to all \

@@ -102,7 +102,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
 
     fn compute_arg_locs(
         call_conv: isa::CallConv,
-        _flags: &settings::Flags,
+        flags: &settings::Flags,
         params: &[ir::AbiParam],
         args_or_rets: ArgsOrRets,
         add_ret_area_ptr: bool,
@@ -161,6 +161,13 @@ impl ABIMachineSpec for AArch64MachineDeps {
                 param.value_type
             );
 
+            if is_apple_cc && param.value_type == types::F128 && !flags.enable_llvm_abi_extensions()
+            {
+                panic!(
+                    "f128 args/return values not supported for apple_aarch64 unless LLVM ABI extensions are enabled"
+                );
+            }
+
             let (rcs, reg_types) = Inst::rc_for_type(param.value_type)?;
 
             if matches!(

@@ -407,6 +407,18 @@
         (rn Reg)
         (rm Reg))
 
+       ;; Floating-point load, half-precision (16 bit).
+       (FpuLoad16
+        (rd WritableReg)
+        (mem AMode)
+        (flags MemFlags))
+
+       ;; Floating-point store, half-precision (16 bit).
+       (FpuStore16
+        (rd Reg)
+        (mem AMode)
+        (flags MemFlags))
+
        ;; Floating-point load, single-precision (32 bit).
        (FpuLoad32
         (rd WritableReg)
@@ -483,6 +495,14 @@
         (rd WritableReg)
         (rn Reg))
 
+       ;; FP conditional select, 16 bit.
+       ;; Requires FEAT_FP16.
+       (FpuCSel16
+        (rd WritableReg)
+        (rn Reg)
+        (rm Reg)
+        (cond Cond))
+
        ;; FP conditional select, 32 bit.
        (FpuCSel32
         (rd WritableReg)
@@ -504,8 +524,8 @@
         (rn Reg))
 
        ;; Move from a GPR to a vector register.  The scalar value is parked in the lowest lane
-       ;; of the destination, and all other lanes are zeroed out.  Currently only 32- and 64-bit
-       ;; transactions are supported.
+       ;; of the destination, and all other lanes are zeroed out. Currently 16-, 32- and 64-bit
+       ;; transactions are supported. 16-bit moves require FEAT_FP16.
        (MovToFpu
         (rd WritableReg)
         (rn Reg)
@@ -1701,6 +1721,9 @@
 (decl use_lse () Inst)
 (extern extractor use_lse use_lse)
 
+;; Query consulted by the f16 lowering rules to choose between half-precision
+;; instructions and their wider fallbacks. Implemented externally; presumably
+;; it reflects the `has_fp16` ISA setting -- TODO confirm.
+(decl pure use_fp16 () bool)
+(extern constructor use_fp16 use_fp16)
+
 ;; Extractor helpers for various immediate constants ;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (decl pure partial move_wide_const_from_u64 (Type u64) MoveWideConst)
@@ -2221,9 +2244,19 @@
             (_ Unit (emit (MInst.VecRRLong op dst src high_half))))
         dst))
 
-;; Helper for emitting `MInst.FpuCSel32` / `MInst.FpuCSel64`
+;; Helper for emitting `MInst.FpuCSel16` / `MInst.FpuCSel32` / `MInst.FpuCSel64`
 ;; instructions.
 (decl fpu_csel (Type Cond Reg Reg) ConsumesFlags)
+;; Fallback for `$F16`: reuse the `$F32` conditional select. The
+;; higher-priority rule below takes over when `use_fp16` returns true.
+(rule (fpu_csel $F16 cond if_true if_false)
+        (fpu_csel $F32 cond if_true if_false))
+
+;; When `use_fp16` returns true, emit the dedicated `MInst.FpuCSel16`.
+(rule 1 (fpu_csel $F16 cond if_true if_false)
+        (if-let $true (use_fp16))
+        (let ((dst WritableReg (temp_writable_reg $F16)))
+          (ConsumesFlags.ConsumesFlagsReturnsReg
+           (MInst.FpuCSel16 dst if_true if_false cond)
+           dst)))
+
 (rule (fpu_csel $F32 cond if_true if_false)
       (let ((dst WritableReg (temp_writable_reg $F32)))
         (ConsumesFlags.ConsumesFlagsReturnsReg
@@ -2268,6 +2301,9 @@
       (let ((dst WritableReg (temp_writable_reg $I8X16))
             (_ Unit (emit (MInst.MovToFpu dst x size))))
         dst))
+;; When `use_fp16` returns false, a 16-bit move is performed as a 32-bit move
+;; instead (per the `MInst.MovToFpu` notes, 16-bit moves require FEAT_FP16).
+(rule 1 (mov_to_fpu x (ScalarSize.Size16))
+        (if-let $false (use_fp16))
+        (mov_to_fpu x (ScalarSize.Size32)))
 
 ;; Helper for emitting `MInst.FpuMoveFPImm` instructions.
 (decl fpu_move_fp_imm (ASIMDFPModImm ScalarSize) Reg)
@@ -2849,6 +2885,11 @@
       (let ((dst WritableReg (temp_writable_reg $I64))
             (_ Unit (emit (MInst.ULoad64 dst amode flags))))
         dst))
+;; Helper for emitting `MInst.FpuLoad16` instructions. The destination temp is
+;; allocated as `$F64`, the same as in `aarch64_fpuload32`.
+(decl aarch64_fpuload16 (AMode MemFlags) Reg)
+(rule (aarch64_fpuload16 amode flags)
+      (let ((dst WritableReg (temp_writable_reg $F64))
+            (_ Unit (emit (MInst.FpuLoad16 dst amode flags))))
+        dst))
 (decl aarch64_fpuload32 (AMode MemFlags) Reg)
 (rule (aarch64_fpuload32 amode flags)
       (let ((dst WritableReg (temp_writable_reg $F64))
@@ -2885,6 +2926,9 @@
 (decl aarch64_store64 (AMode MemFlags Reg) SideEffectNoResult)
 (rule (aarch64_store64 amode flags val)
       (SideEffectNoResult.Inst (MInst.Store64 val amode flags)))
+;; Helper for building an `MInst.FpuStore16`; the instruction is returned
+;; wrapped in `SideEffectNoResult.Inst` rather than emitted here.
+(decl aarch64_fpustore16 (AMode MemFlags Reg) SideEffectNoResult)
+(rule (aarch64_fpustore16 amode flags val)
+      (SideEffectNoResult.Inst (MInst.FpuStore16 val amode flags)))
 (decl aarch64_fpustore32 (AMode MemFlags Reg) SideEffectNoResult)
 (rule (aarch64_fpustore32 amode flags val)
       (SideEffectNoResult.Inst (MInst.FpuStore32 val amode flags)))
@@ -3229,19 +3273,41 @@
 (rule 1 (add_imm_to_addr val (imm12_from_u64 imm)) (add_imm $I64 val imm))
 (rule 0 (add_imm_to_addr val offset) (add $I64 val (imm $I64 (ImmExtend.Zero) offset)))
 
+;; Lower a constant f16.
+;;
+;; Note that we must make sure that all bits outside the lowest 16 are set to 0
+;; because this function is also used to load wider constants (that have zeros
+;; in their most significant bits).
+(decl constant_f16 (u16) Reg)
+;; `use_fp16` is false: hand the value to `constant_f32` instead.
+(rule 3 (constant_f16 n)
+        (if-let $false (use_fp16))
+        (constant_f32 n))
+;; Zero: use `vec_dup_imm` with the zero modified-immediate (`Size32x2` lanes).
+(rule 2 (constant_f16 0)
+        (vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
+                     $false
+                     (VectorSize.Size32x2)))
+;; Value accepted by `asimd_fp_mod_imm_from_u64` at 16-bit size: use
+;; `fpu_move_fp_imm`.
+(rule 1 (constant_f16 n)
+        (if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size16)))
+        (fpu_move_fp_imm imm (ScalarSize.Size16)))
+;; Otherwise: materialize the bits with `imm` (zero-extended `$I16`) and move
+;; them over with a 16-bit `mov_to_fpu`.
+(rule (constant_f16 n)
+      (mov_to_fpu (imm $I16 (ImmExtend.Zero) n) (ScalarSize.Size16)))
+
 ;; Lower a constant f32.
 ;;
 ;; Note that we must make sure that all bits outside the lowest 32 are set to 0
 ;; because this function is also used to load wider constants (that have zeros
 ;; in their most significant bits).
 (decl constant_f32 (u32) Reg)
-(rule 2 (constant_f32 0)
+(rule 3 (constant_f32 0)
         (vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
                      $false
                      (VectorSize.Size32x2)))
-(rule 1 (constant_f32 n)
+(rule 2 (constant_f32 n)
         (if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size32)))
         (fpu_move_fp_imm imm (ScalarSize.Size32)))
+;; When `use_fp16` returns true, values matched by `u32_as_u16` (presumably
+;; those that fit in 16 bits -- TODO confirm) are delegated to `constant_f16`.
+(rule 1 (constant_f32 (u32_as_u16 n))
+        (if-let $true (use_fp16))
+        (constant_f16 n))
 (rule (constant_f32 n)
       (mov_to_fpu (imm $I32 (ImmExtend.Zero) n) (ScalarSize.Size32)))
 
@@ -4063,8 +4129,10 @@
 
 ;; Helpers for generating select instruction sequences.
 (decl lower_select (ProducesFlags Cond Type Value Value) ValueRegs)
-(rule 2 (lower_select flags cond (ty_scalar_float ty) rn rm)
+(rule 2 (lower_select flags cond (ty_scalar_float (fits_in_64 ty)) rn rm)
       (with_flags flags (fpu_csel ty cond rn rm)))
+;; `$F128` values are selected with `vec_csel`, like the 128-bit vector case.
+(rule 4 (lower_select flags cond $F128 rn rm)
+      (with_flags flags (vec_csel cond rn rm)))
 (rule 3 (lower_select flags cond (ty_vec128 ty) rn rm)
       (with_flags flags (vec_csel cond rn rm)))
 (rule (lower_select flags cond ty rn rm)

@@ -958,6 +958,7 @@ impl MachInstEmit for Inst {
             | &Inst::ULoad64 {
                 rd, ref mem, flags, ..
             }
+            | &Inst::FpuLoad16 { rd, ref mem, flags }
             | &Inst::FpuLoad32 { rd, ref mem, flags }
             | &Inst::FpuLoad64 { rd, ref mem, flags }
             | &Inst::FpuLoad128 { rd, ref mem, flags } => {
@@ -983,6 +984,7 @@ impl MachInstEmit for Inst {
                     Inst::ULoad32 { .. } => 0b1011100001,
                     Inst::SLoad32 { .. } => 0b1011100010,
                     Inst::ULoad64 { .. } => 0b1111100001,
+                    Inst::FpuLoad16 { .. } => 0b0111110001,
                     Inst::FpuLoad32 { .. } => 0b1011110001,
                     Inst::FpuLoad64 { .. } => 0b1111110001,
                     Inst::FpuLoad128 { .. } => 0b0011110011,
@@ -1098,6 +1100,7 @@ impl MachInstEmit for Inst {
             | &Inst::Store16 { rd, ref mem, flags }
             | &Inst::Store32 { rd, ref mem, flags }
             | &Inst::Store64 { rd, ref mem, flags }
+            | &Inst::FpuStore16 { rd, ref mem, flags }
             | &Inst::FpuStore32 { rd, ref mem, flags }
             | &Inst::FpuStore64 { rd, ref mem, flags }
             | &Inst::FpuStore128 { rd, ref mem, flags } => {
@@ -1114,6 +1117,7 @@ impl MachInstEmit for Inst {
                     Inst::Store16 { .. } => 0b0111100000,
                     Inst::Store32 { .. } => 0b1011100000,
                     Inst::Store64 { .. } => 0b1111100000,
+                    Inst::FpuStore16 { .. } => 0b0111110000,
                     Inst::FpuStore32 { .. } => 0b1011110000,
                     Inst::FpuStore64 { .. } => 0b1111110000,
                     Inst::FpuStore128 { .. } => 0b0011110010,
@@ -2213,6 +2217,9 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(enc_inttofpu(top16, rd, rn));
             }
+            &Inst::FpuCSel16 { rd, rn, rm, cond } => {
+                sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size16));
+            }
             &Inst::FpuCSel32 { rd, rn, rm, cond } => {
                 sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size32));
             }
@@ -2234,21 +2241,17 @@ impl MachInstEmit for Inst {
             }
             &Inst::MovToFpu { rd, rn, size } => {
                 let template = match size {
+                    ScalarSize::Size16 => 0b000_11110_11_1_00_111_000000_00000_00000,
                     ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
                     ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
                     _ => unreachable!(),
                 };
                 sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
             }
             &Inst::FpuMoveFPImm { rd, imm, size } => {
-                let size_code = match size {
-                    ScalarSize::Size32 => 0b00,
-                    ScalarSize::Size64 => 0b01,
-                    _ => unimplemented!(),
-                };
                 sink.put4(
                     0b000_11110_00_1_00_000_000100_00000_00000
-                        | size_code << 22
+                        | size.ftype() << 22
                         | ((imm.enc_bits() as u32) << 13)
                         | machreg_to_vec(rd.to_reg()),
                 );

@@ -6699,6 +6699,19 @@ fn test_aarch64_binemit() {
         "fcmp d23, d24",
     ));
 
+    insns.push((
+        Inst::FpuLoad16 {
+            rd: writable_vreg(16),
+            mem: AMode::RegScaled {
+                rn: xreg(8),
+                rm: xreg(9),
+            },
+            flags: MemFlags::trusted(),
+        },
+        "1079697C",
+        "ldr h16, [x8, x9, LSL #1]",
+    ));
+
     insns.push((
         Inst::FpuLoad32 {
             rd: writable_vreg(16),
@@ -6774,6 +6787,19 @@ fn test_aarch64_binemit() {
         "ldr q16, pc+8",
     ));
 
+    insns.push((
+        Inst::FpuStore16 {
+            rd: vreg(16),
+            mem: AMode::RegScaled {
+                rn: xreg(8),
+                rm: xreg(9),
+            },
+            flags: MemFlags::trusted(),
+        },
+        "1079297C",
+        "str h16, [x8, x9, LSL #1]",
+    ));
+
     insns.push((
         Inst::FpuStore32 {
             rd: vreg(16),
@@ -6973,6 +6999,17 @@ fn test_aarch64_binemit() {
         "stp q18, q22, [sp], #304",
     ));
 
+    insns.push((
+        Inst::FpuCSel16 {
+            rd: writable_vreg(1),
+            rn: vreg(2),
+            rm: vreg(3),
+            cond: Cond::Hi,
+        },
+        "418CE31E",
+        "fcsel h1, h2, h3, hi",
+    ));
+
     insns.push((
         Inst::FpuCSel32 {
             rd: writable_vreg(1),