Floating point with AVX.

This commit is contained in:
losfair 2019-04-09 01:24:30 +08:00
parent 154f7f8fd3
commit e32816b06b
3 changed files with 283 additions and 5 deletions

View File

@ -525,6 +525,7 @@ impl X64FunctionCode {
(_, Location::Imm32(_)) | (_, Location::Imm64(_)) => RelaxMode::DstToGPR,
(Location::Imm64(_), Location::Memory(_, _)) => RelaxMode::SrcToGPR,
(Location::Imm64(_), Location::GPR(_)) if (op as *const u8 != Assembler::emit_mov as *const u8) => RelaxMode::SrcToGPR,
(_, Location::XMM(_)) => RelaxMode::SrcToGPR,
_ if (op as *const u8 == Assembler::emit_imul as *const u8) => RelaxMode::BothToGPR, // TODO: optimize this
_ => RelaxMode::Direct,
};
@ -563,6 +564,75 @@ impl X64FunctionCode {
}
}
/// Emits a three-operand AVX scalar instruction (`op(dst, src1, src2)`) while
/// "relaxing" the operand kinds: `src1`/`src2` may be XMM registers, GPRs,
/// memory references, or immediates, and `dst` may be an XMM register or
/// memory. Operands that the instruction cannot consume directly are staged
/// through freshly acquired temporaries.
///
/// `op` is one of the `Assembler::emit_v*` methods taking
/// `(assembler, src1_xmm, src2_xmm_or_mem, dst_xmm)`.
fn emit_relaxed_avx(
    a: &mut Assembler,
    m: &mut Machine,
    op: fn(&mut Assembler, XMM, XMMOrMemory, XMM),
    src1: Location,
    src2: Location,
    dst: Location,
) {
    // All temporaries are acquired up front even though some paths leave one
    // or more unused; they are released in reverse order at the end.
    let tmp1 = m.acquire_temp_xmm().unwrap();
    let tmp2 = m.acquire_temp_xmm().unwrap();
    let tmp3 = m.acquire_temp_xmm().unwrap();
    let tmpg = m.acquire_temp_gpr().unwrap();
    // Normalize `src1` to a plain XMM register (the AVX first source must be
    // a register).
    let src1 = match src1 {
        Location::XMM(x) => x,
        Location::GPR(_) | Location::Memory(_, _) => {
            // NOTE(review): assumes `emit_mov` with an XMM destination emits a
            // raw bit-pattern move (movd/movq-style) — confirm in the
            // Assembler implementation.
            a.emit_mov(Size::S64, src1, Location::XMM(tmp1));
            tmp1
        }
        Location::Imm32(_) => {
            // Immediates cannot be moved straight into an XMM register, so
            // bounce the bits through a temporary GPR first.
            a.emit_mov(Size::S32, src1, Location::GPR(tmpg));
            a.emit_mov(Size::S32, Location::GPR(tmpg), Location::XMM(tmp1));
            tmp1
        }
        Location::Imm64(_) => {
            a.emit_mov(Size::S64, src1, Location::GPR(tmpg));
            a.emit_mov(Size::S64, Location::GPR(tmpg), Location::XMM(tmp1));
            tmp1
        }
        _ => unreachable!()
    };
    // Normalize `src2` to `XMMOrMemory`; memory operands can be consumed
    // directly by the instruction, everything else goes through `tmp2`.
    let src2 = match src2 {
        Location::XMM(x) => XMMOrMemory::XMM(x),
        Location::Memory(base, disp) => XMMOrMemory::Memory(base, disp),
        Location::GPR(_) => {
            a.emit_mov(Size::S64, src2, Location::XMM(tmp2));
            XMMOrMemory::XMM(tmp2)
        }
        Location::Imm32(_) => {
            // `tmpg` can be reused here: the `src1` staging above has already
            // completed and its value now lives in an XMM register.
            a.emit_mov(Size::S32, src2, Location::GPR(tmpg));
            a.emit_mov(Size::S32, Location::GPR(tmpg), Location::XMM(tmp2));
            XMMOrMemory::XMM(tmp2)
        }
        Location::Imm64(_) => {
            a.emit_mov(Size::S64, src2, Location::GPR(tmpg));
            a.emit_mov(Size::S64, Location::GPR(tmpg), Location::XMM(tmp2));
            XMMOrMemory::XMM(tmp2)
        }
        _ => unreachable!()
    };
    // Emit the operation. An XMM destination is written directly; a memory
    // destination is produced in `tmp3` and stored afterwards.
    match dst {
        Location::XMM(x) => {
            op(a, src1, src2, x);
        },
        Location::Memory(_, _) => {
            op(a, src1, src2, tmp3);
            a.emit_mov(Size::S64, Location::XMM(tmp3), dst);
        },
        _ => unreachable!(),
    }
    // Release temporaries in reverse acquisition order.
    m.release_temp_gpr(tmpg);
    m.release_temp_xmm(tmp3);
    m.release_temp_xmm(tmp2);
    m.release_temp_xmm(tmp1);
}
fn emit_binop_i32(
a: &mut Assembler,
m: &mut Machine,
@ -849,6 +919,33 @@ impl X64FunctionCode {
value_stack.push((ret, LocalOrTemp::Temp));
}
/// Pops two floating-point operands off the value stack, applies the AVX
/// binop `f` via `emit_relaxed_avx`, and pushes the result location as a
/// temporary.
fn emit_fp_binop_avx(
    a: &mut Assembler,
    m: &mut Machine,
    value_stack: &mut Vec<(Location, LocalOrTemp)>,
    f: fn(&mut Assembler, XMM, XMMOrMemory, XMM),
) {
    // Operands come off the stack right-hand side first.
    let rhs = get_location_released(a, m, value_stack.pop().unwrap());
    let lhs = get_location_released(a, m, value_stack.pop().unwrap());
    // NOTE(review): the result slot is always tagged F64, even when the
    // caller is lowering an f32 operator — presumably harmless because both
    // FP types are allocated from XMM registers; confirm against
    // Machine::acquire_locations.
    let ret = m.acquire_locations(a, &[WpType::F64], false)[0];
    Self::emit_relaxed_avx(a, m, f, lhs, rhs, ret);
    value_stack.push((ret, LocalOrTemp::Temp));
}
/// Pops one floating-point operand off the value stack, applies the AVX
/// instruction `f`, and pushes the result location as a temporary.
fn emit_fp_unop_avx(
    a: &mut Assembler,
    m: &mut Machine,
    value_stack: &mut Vec<(Location, LocalOrTemp)>,
    f: fn(&mut Assembler, XMM, XMMOrMemory, XMM),
) {
    let operand = get_location_released(a, m, value_stack.pop().unwrap());
    let ret = m.acquire_locations(a, &[WpType::F64], false)[0];
    // The single operand is fed in as both AVX sources; for the unary
    // instructions used here (sqrt/round) the first source only supplies the
    // untouched upper lanes of the destination.
    Self::emit_relaxed_avx(a, m, f, operand, operand, ret);
    value_stack.push((ret, LocalOrTemp::Temp));
}
// This function must not use any temporary register before `cb` is called.
fn emit_call_sysv<I: Iterator<Item = Location>, F: FnOnce(&mut Assembler)>(a: &mut Assembler, m: &mut Machine, cb: F, params: I) {
let params: Vec<_> = params.collect();
@ -859,6 +956,15 @@ impl X64FunctionCode {
a.emit_push(Size::S64, Location::GPR(*r));
}
// Save used XMM registers.
let used_xmms = m.get_used_xmms();
if used_xmms.len() > 0 {
a.emit_sub(Size::S64, Location::Imm32((used_xmms.len() * 8) as u32), Location::GPR(GPR::RSP));
for (i, r) in used_xmms.iter().enumerate() {
a.emit_mov(Size::S64, Location::XMM(*r), Location::Memory(GPR::RSP, (i * 8) as i32));
}
}
let mut stack_offset: usize = 0;
// Calculate stack offset.
@ -914,6 +1020,14 @@ impl X64FunctionCode {
a.emit_add(Size::S64, Location::Imm32(stack_offset as u32), Location::GPR(GPR::RSP));
}
// Restore XMMs.
if used_xmms.len() > 0 {
for (i, r) in used_xmms.iter().enumerate() {
a.emit_mov(Size::S64, Location::Memory(GPR::RSP, (i * 8) as i32), Location::XMM(*r));
}
a.emit_add(Size::S64, Location::Imm32((used_xmms.len() * 8) as u32), Location::GPR(GPR::RSP));
}
// Restore GPRs.
for r in used_gprs.iter().rev() {
a.emit_pop(Size::S64, Location::GPR(*r));
@ -1155,7 +1269,7 @@ impl FunctionCodeGenerator for X64FunctionCode {
let loc_a = get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
let ret = self.machine.acquire_locations(a, &[WpType::I32], false)[0];
a.emit_mov(Size::S32, loc_a, Location::GPR(GPR::RAX));
a.emit_xor(Size::S32, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX));
a.emit_cdq();
Self::emit_relaxed_xdiv(a, &mut self.machine, Assembler::emit_idiv, Size::S32, loc_b);
a.emit_mov(Size::S32, Location::GPR(GPR::RAX), ret);
self.value_stack.push((ret, LocalOrTemp::Temp));
@ -1232,7 +1346,7 @@ impl FunctionCodeGenerator for X64FunctionCode {
let loc_a = get_location_released(a, &mut self.machine, self.value_stack.pop().unwrap());
let ret = self.machine.acquire_locations(a, &[WpType::I64], false)[0];
a.emit_mov(Size::S64, loc_a, Location::GPR(GPR::RAX));
a.emit_xor(Size::S64, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX));
a.emit_cqo();
Self::emit_relaxed_xdiv(a, &mut self.machine, Assembler::emit_idiv, Size::S64, loc_b);
a.emit_mov(Size::S64, Location::GPR(GPR::RAX), ret);
self.value_stack.push((ret, LocalOrTemp::Temp));
@ -1308,6 +1422,45 @@ impl FunctionCodeGenerator for X64FunctionCode {
Size::S32, loc, ret,
);
}
Operator::F32Const { value } => self.value_stack.push((Location::Imm32(value.bits()), LocalOrTemp::Temp)),
Operator::F32Add => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vaddss),
Operator::F32Sub => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vsubss),
Operator::F32Mul => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vmulss),
Operator::F32Div => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vdivss),
Operator::F32Max => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vmaxss),
Operator::F32Min => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vminss),
Operator::F32Eq => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpeqss),
Operator::F32Ne => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpneqss),
Operator::F32Lt => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpltss),
Operator::F32Le => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpless),
Operator::F32Gt => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpgtss),
Operator::F32Ge => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpgess),
Operator::F32Nearest => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundss_nearest),
Operator::F32Floor => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundss_floor),
Operator::F32Ceil => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundss_ceil),
Operator::F32Trunc => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundss_trunc),
Operator::F32Sqrt => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vsqrtss),
Operator::F64Const { value } => self.value_stack.push((Location::Imm64(value.bits()), LocalOrTemp::Temp)),
Operator::F64Add => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vaddsd),
Operator::F64Sub => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vsubsd),
Operator::F64Mul => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vmulsd),
Operator::F64Div => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vdivsd),
Operator::F64Max => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vmaxsd),
Operator::F64Min => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vminsd),
Operator::F64Eq => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpeqsd),
Operator::F64Ne => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpneqsd),
Operator::F64Lt => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpltsd),
Operator::F64Le => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmplesd),
Operator::F64Gt => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpgtsd),
Operator::F64Ge => Self::emit_fp_binop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vcmpgesd),
Operator::F64Nearest => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundsd_nearest),
Operator::F64Floor => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundsd_floor),
Operator::F64Ceil => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundsd_ceil),
Operator::F64Trunc => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vroundsd_trunc),
Operator::F64Sqrt => Self::emit_fp_unop_avx(a, &mut self.machine, &mut self.value_stack, Assembler::emit_vsqrtsd),
Operator::Call { function_index } => {
let function_index = function_index as usize;
let label = self

View File

@ -69,6 +69,12 @@ pub enum Size {
S64,
}
/// An operand that is either an XMM register or a base-plus-displacement
/// memory reference — the forms accepted as the second source of the AVX
/// scalar instructions emitted by this module.
#[derive(Copy, Clone, Debug)]
pub enum XMMOrMemory {
    /// An XMM register operand.
    XMM(XMM),
    /// A memory operand: base GPR plus signed 32-bit displacement.
    Memory(GPR, i32),
}
pub trait Emitter {
type Label;
type Offset;
@ -81,6 +87,8 @@ pub trait Emitter {
fn emit_mov(&mut self, sz: Size, src: Location, dst: Location);
fn emit_lea(&mut self, sz: Size, src: Location, dst: Location);
fn emit_lea_label(&mut self, label: Self::Label, dst: Location);
fn emit_cdq(&mut self);
fn emit_cqo(&mut self);
fn emit_xor(&mut self, sz: Size, src: Location, dst: Location);
fn emit_jmp(&mut self, condition: Condition, label: Self::Label);
fn emit_jmp_location(&mut self, loc: Location);
@ -108,6 +116,49 @@ pub trait Emitter {
fn emit_movzx(&mut self, sz_src: Size, src: Location, sz_dst: Size, dst: Location);
fn emit_movsx(&mut self, sz_src: Size, src: Location, sz_dst: Size, dst: Location);
fn emit_vaddss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vaddsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vsubss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vsubsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vmulss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vmulsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vdivss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vdivsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vmaxss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vmaxsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vminss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vminsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpeqss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpeqsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpneqss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpneqsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpltss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpltsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpless(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmplesd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpgtss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpgtsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpgess(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vcmpgesd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vsqrtss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vsqrtsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vroundss_nearest(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vroundss_floor(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vroundss_ceil(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vroundss_trunc(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vroundsd_nearest(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vroundsd_floor(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vroundsd_ceil(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_vroundsd_trunc(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM);
fn emit_ud2(&mut self);
fn emit_ret(&mut self);
fn emit_call_label(&mut self, label: Self::Label);
@ -306,6 +357,28 @@ macro_rules! trap_op {
}
}
// Generates an `Emitter` method that assembles a three-operand AVX scalar
// instruction `$ins dst, src1, src2`, where `src2` may be either an XMM
// register or a memory operand.
macro_rules! avx_fn {
    ($ins:ident, $name:ident) => {
        fn $name(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
            match src2 {
                // Register form: `$ins xmm_dst, xmm_src1, xmm_src2`.
                XMMOrMemory::XMM(x) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), Rx((x as u8))),
                // Memory form: `$ins xmm_dst, xmm_src1, [base + disp]`.
                XMMOrMemory::Memory(base, disp) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), [Rq((base as u8)) + disp]),
            }
        }
    }
}
// Like `avx_fn!`, but for vroundss/vroundsd, which take a trailing imm8
// rounding-control operand. `$mode` is that immediate: 0 = nearest (even),
// 1 = floor, 2 = ceil, 3 = truncate (Intel SDM ROUNDSS/ROUNDSD encoding).
macro_rules! avx_round_fn {
    ($ins:ident, $name:ident, $mode:expr) => {
        fn $name(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) {
            match src2 {
                // Register form with explicit rounding-mode immediate.
                XMMOrMemory::XMM(x) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), Rx((x as u8)), $mode),
                // Memory form with explicit rounding-mode immediate.
                XMMOrMemory::Memory(base, disp) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), [Rq((base as u8)) + disp], $mode),
            }
        }
    }
}
impl Emitter for Assembler {
type Label = DynamicLabel;
type Offset = AssemblyOffset;
@ -386,6 +459,12 @@ impl Emitter for Assembler {
_ => unreachable!(),
}
}
// Emits `cdq`: sign-extends EAX into EDX:EAX (used before 32-bit idiv).
fn emit_cdq(&mut self) {
    dynasm!(self ; cdq);
}
// Emits `cqo`: sign-extends RAX into RDX:RAX (used before 64-bit idiv).
fn emit_cqo(&mut self) {
    dynasm!(self ; cqo);
}
// Emits an integer `xor src, dst` for the supported location/size
// combinations; presumably the `binop_all_nofp!` macro's fallback arm fires
// for unsupported (e.g. XMM) operands — hence the `unreachable!()`.
fn emit_xor(&mut self, sz: Size, src: Location, dst: Location) {
    binop_all_nofp!(xor, self, sz, src, dst, {unreachable!()});
}
@ -582,6 +661,54 @@ impl Emitter for Assembler {
}
}
// Scalar AVX arithmetic, f32 (`ss`) and f64 (`sd`) variants.
avx_fn!(vaddss, emit_vaddss);
avx_fn!(vaddsd, emit_vaddsd);
avx_fn!(vsubss, emit_vsubss);
avx_fn!(vsubsd, emit_vsubsd);
avx_fn!(vmulss, emit_vmulss);
avx_fn!(vmulsd, emit_vmulsd);
avx_fn!(vdivss, emit_vdivss);
avx_fn!(vdivsd, emit_vdivsd);
avx_fn!(vmaxss, emit_vmaxss);
avx_fn!(vmaxsd, emit_vmaxsd);
avx_fn!(vminss, emit_vminss);
avx_fn!(vminsd, emit_vminsd);
// Scalar compares. NOTE(review): vcmp* writes an all-ones/all-zeros mask
// into the destination XMM, not an integer 0/1 — callers lowering wasm
// comparison operators must convert the mask; confirm at the call sites.
avx_fn!(vcmpeqss, emit_vcmpeqss);
avx_fn!(vcmpeqsd, emit_vcmpeqsd);
avx_fn!(vcmpneqss, emit_vcmpneqss);
avx_fn!(vcmpneqsd, emit_vcmpneqsd);
avx_fn!(vcmpltss, emit_vcmpltss);
avx_fn!(vcmpltsd, emit_vcmpltsd);
avx_fn!(vcmpless, emit_vcmpless);
avx_fn!(vcmplesd, emit_vcmplesd);
avx_fn!(vcmpgtss, emit_vcmpgtss);
avx_fn!(vcmpgtsd, emit_vcmpgtsd);
avx_fn!(vcmpgess, emit_vcmpgess);
avx_fn!(vcmpgesd, emit_vcmpgesd);
// Scalar square root.
avx_fn!(vsqrtss, emit_vsqrtss);
avx_fn!(vsqrtsd, emit_vsqrtsd);
// Rounding with an explicit imm8 rounding control:
// 0 = nearest (even), 1 = floor, 2 = ceil, 3 = truncate.
avx_round_fn!(vroundss, emit_vroundss_nearest, 0);
avx_round_fn!(vroundss, emit_vroundss_floor, 1);
avx_round_fn!(vroundss, emit_vroundss_ceil, 2);
avx_round_fn!(vroundss, emit_vroundss_trunc, 3);
avx_round_fn!(vroundsd, emit_vroundsd_nearest, 0);
avx_round_fn!(vroundsd, emit_vroundsd_floor, 1);
avx_round_fn!(vroundsd, emit_vroundsd_ceil, 2);
avx_round_fn!(vroundsd, emit_vroundsd_trunc, 3);
// Emits `ud2`, the guaranteed-invalid opcode, to raise an invalid-opcode
// trap (used for unreachable code paths / wasm traps).
fn emit_ud2(&mut self) {
    dynasm!(self ; ud2);
}

View File

@ -186,9 +186,7 @@ impl Machine {
for ty in tys {
let loc = match *ty {
WpType::F32 | WpType::F64 => {
self.pick_xmm().map(Location::XMM).or_else(
|| self.pick_gpr().map(Location::GPR)
)
self.pick_xmm().map(Location::XMM)
},
WpType::I32 | WpType::I64 => {
self.pick_gpr().map(Location::GPR)