Floor
A C++ Compute/Graphics Library and Toolchain enabling same-source CUDA/Host/Metal/OpenCL/Vulkan C++ programming and execution.
Install / Use
/learn @a2flo/Floor
README
:toc:
= Flo's Open libRary =
== What is it? ==
This project provides a unified compute & graphics host API, as well as a unified compute & graphics C++ device language and library to enable same-source CUDA/Host/Metal/OpenCL/Vulkan programming and execution.
The unified host API is implemented at link:https://github.com/a2flo/floor/tree/master/src/device[device]. All backends (CUDA/Host/Metal/OpenCL/Vulkan) currently provide compute support, while graphics support is limited to Metal and Vulkan.
To provide a unified device language, a clang/LLVM/libc++ 14.0 toolchain has been link:https://github.com/a2flo/floor_llvm[modified].
Certain parts of libfloor are used by both host and device code (link:https://github.com/a2flo/floor/tree/master/include/floor/math[math] and link:https://github.com/a2flo/floor/tree/master/include/floor/constexpr[constexpr]). Additional device library code is located at link:https://github.com/a2flo/floor/tree/master/include/floor/device/backend[backend].
Advanced examples can be found in the link:https://github.com/a2flo/floor_examples[floor_examples] repository.
=== Example ===
Consider the fairly simple C++ kernel below, which computes the body/body interactions in a link:https://www.youtube.com/watch?v=DoLe1c-eokI[N-body simulation], and compile it for each backend. Note that loop unrolling is omitted for conciseness.

[source,c++]
// define global constants static constexpr constant const uint32_t NBODY_TILE_SIZE { 256u }; static constexpr constant const float NBODY_DAMPING { 0.999f }; static constexpr constant const float NBODY_SOFTENING { 0.01f }; // define a 1D kernel with a required local size of (NBODY_TILE_SIZE = 256, 1, 1) kernel_1d(NBODY_TILE_SIZE) void simplified_nbody(buffer<const float4> in_positions, // read-only global memory buffer buffer<float4> out_positions, // read-write global memory buffer buffer<float3> inout_velocities, // read-write global memory buffer param<float> time_delta) { // read-only parameter // each work-item represents/computes one body const auto position = in_positions[global_id.x]; auto velocity = inout_velocities[global_id.x]; float3 acceleration; // vectors are automatically zero-initialized local_buffer<float4, NBODY_TILE_SIZE> local_body_positions; // local memory array allocation // loop over all bodies for (uint32_t i = 0, tile = 0, count = global_size.x; i < count; i += NBODY_TILE_SIZE, ++tile) { // move resp. body position/mass from global to local memory local_body_positions[local_id.x] = in_positions[tile * NBODY_TILE_SIZE + local_id.x]; local_barrier(); // barrier across all work-items in this work-group // loop over bodies in this work-group for (uint32_t j = 0; j < NBODY_TILE_SIZE; ++j) { const auto r = local_body_positions[j].xyz - position.xyz; const auto dist_sq = r.dot(r) + (NBODY_SOFTENING * NBODY_SOFTENING); const auto inv_dist = rsqrt(dist_sq); const auto s = local_body_positions[j].w * (inv_dist * inv_dist * inv_dist); // .w is mass acceleration += r * s; } local_barrier(); } velocity = (velocity + acceleration * time_delta) * NBODY_DAMPING; out_positions[global_id.x].xyz += velocity * time_delta; // update XYZ position inout_velocities[global_id.x] = velocity; // update velocity }
Click to unfold the output for each backend:

++++
<details>
<summary>CUDA / PTX</summary>
You can download the PTX file <a href="https://github.com/a2flo/floor/blob/master/etc/example/nbody.ptx">here</a> and the CUBIN file <a href="https://github.com/a2flo/floor/blob/master/etc/example/nbody.cubin">here</a> (note that building CUBINs is optional and requires <code>ptxas</code>).
++++

[source,Unix Assembly]
// // Generated by LLVM NVPTX Back-End //
.version 8.4 .target sm_86 .address_size 64
// .globl simplified_nbody
// _ZZ16simplified_nbodyE20local_body_positions has been demoted
.visible .entry simplified_nbody( .param .u64 simplified_nbody_param_0, .param .u64 simplified_nbody_param_1, .param .u64 simplified_nbody_param_2, .param .f32 simplified_nbody_param_3 ) .reqntid 256, 1, 1 { .reg .pred %p<3>; .reg .b32 %r<25>; .reg .f32 %f<71>; .reg .b64 %rd<18>; // demoted variable .shared .align 4 .b8 _ZZ16simplified_nbodyE20local_body_positions[4096]; mov.u32 %r1, %tid.x; mov.u32 %r11, %ntid.x; mov.u32 %r12, %ctaid.x; mad.lo.s32 %r13, %r12, %r11, %r1; cvt.u64.u32 %rd3, %r13; mul.wide.u32 %rd7, %r13, 12; ld.param.u64 %rd8, [simplified_nbody_param_2]; cvta.to.global.u64 %rd9, %rd8; add.s64 %rd4, %rd9, %rd7; ld.global.f32 %f6, [%rd4+8]; add.s64 %rd6, %rd4, 8; ld.global.f32 %f5, [%rd4+4]; add.s64 %rd5, %rd4, 4; ld.global.f32 %f4, [%rd4]; mul.wide.u32 %rd10, %r13, 16; ld.param.u64 %rd11, [simplified_nbody_param_0]; cvta.to.global.u64 %rd2, %rd11; add.s64 %rd12, %rd2, %rd10; ld.global.nc.f32 %f3, [%rd12+8]; ld.global.nc.f32 %f2, [%rd12+4]; ld.global.nc.f32 %f1, [%rd12]; mov.u32 %r14, %nctaid.x; mul.lo.s32 %r2, %r14, %r11; shl.b32 %r15, %r1, 4; mov.u32 %r16, _ZZ16simplified_nbodyE20local_body_positions; add.s32 %r3, %r16, %r15; ld.param.u64 %rd13, [simplified_nbody_param_1]; cvta.to.global.u64 %rd1, %rd13; mov.f32 %f68, 0f00000000; mov.u32 %r10, 0; ld.param.f32 %f16, [simplified_nbody_param_3]; mov.u32 %r22, %r10; mov.u32 %r23, %r10; mov.f32 %f69, %f68; mov.f32 %f70, %f68; LBB0_1: shl.b32 %r18, %r23, 8; add.s32 %r19, %r18, %r1; mul.wide.u32 %rd14, %r19, 16; add.s64 %rd15, %rd2, %rd14; ld.global.nc.f32 %f18, [%rd15]; st.shared.f32 [%r3], %f18; ld.global.nc.f32 %f19, [%rd15+4]; st.shared.f32 [%r3+4], %f19; ld.global.nc.f32 %f20, [%rd15+8]; st.shared.f32 [%r3+8], %f20; ld.global.nc.f32 %f21, [%rd15+12]; st.shared.f32 [%r3+12], %f21; barrier.sync 0; mov.u32 %r24, %r10; LBB0_2: add.s32 %r21, %r16, %r24; ld.shared.f32 %f22, [%r21+4]; sub.f32 %f23, %f22, %f2; ld.shared.f32 %f24, [%r21]; sub.f32 %f25, %f24, %f1; fma.rn.f32 %f26, %f25, %f25, 0f38D1B717; 
fma.rn.f32 %f27, %f23, %f23, %f26; ld.shared.f32 %f28, [%r21+8]; sub.f32 %f29, %f28, %f3; fma.rn.f32 %f30, %f29, %f29, %f27; rsqrt.approx.ftz.f32 %f31, %f30; mul.f32 %f32, %f31, %f31; mul.f32 %f33, %f32, %f31; ld.shared.f32 %f34, [%r21+12]; mul.f32 %f35, %f33, %f34; fma.rn.f32 %f36, %f35, %f29, %f68; ld.shared.f32 %f37, [%r21+20]; sub.f32 %f38, %f37, %f2; ld.shared.f32 %f39, [%r21+16]; sub.f32 %f40, %f39, %f1; fma.rn.f32 %f41, %f40, %f40, 0f38D1B717; fma.rn.f32 %f42, %f38, %f38, %f41; ld.shared.f32 %f43, [%r21+24]; sub.f32 %f44, %f43, %f3; fma.rn.f32 %f45, %f44, %f44, %f42; rsqrt.approx.ftz.f32 %f46, %f45; mul.f32 %f47, %f46, %f46; mul.f32 %f48, %f47, %f46; ld.shared.f32 %f49, [%r21+28]; mul.f32 %f50, %f48, %f49; fma.rn.f32 %f68, %f50, %f44, %f36; fma.rn.f32 %f51, %f35, %f23, %f69; fma.rn.f32 %f69, %f50, %f38, %f51; fma.rn.f32 %f52, %f35, %f25, %f70; fma.rn.f32 %f70, %f50, %f40, %f52; add.s32 %r24, %r24, 32; setp.eq.s32 %p1, %r24, 4096; @%p1 bra LBB0_3; bra.uni LBB0_2; LBB0_3: add.s32 %r22, %r22, 256; setp.lt.u32 %p2, %r22, %r2; barrier.sync 0; add.s32 %r23, %r23, 1; @%p2 bra LBB0_1; fma.rn.f32 %f53, %f70, %f16, %f4; mul.f32 %f54, %f53, 0f3F7FBE77; shl.b64 %rd16, %rd3, 4; add.s64 %rd17, %rd1, %rd16; ld.global.f32 %f55, [%rd17]; fma.rn.f32 %f56, %f54, %f16, %f55; st.global.f32 [%rd17], %f56; fma.rn.f32 %f57, %f69, %f16, %f5; mul.f32 %f58, %f57, 0f3F7FBE77; ld.global.f32 %f59, [%rd17+4]; fma.rn.f32 %f60, %f58, %f16, %f59; st.global.f32 [%rd17+4], %f60; fma.rn.f32 %f61, %f68, %f16, %f6; mul.f32 %f62, %f61, 0f3F7FBE77; ld.global.f32 %f63, [%rd17+8]; fma.rn.f32 %f64, %f62, %f16, %f63; st.global.f32 [%rd17+8], %f64; st.global.f32 [%rd4], %f54; st.global.f32 [%rd5], %f58; st.global.f32 [%rd6], %f62; ret;
}
++++ </code></pre>
</details>
<details>
<summary>Host-Compute (x86 CPU)</summary>
Note that the compiler would usually directly output a <a href="https://github.com/a2flo/floor/blob/master/etc/example/nbody_x86_64.bin">.bin file</a> (ELF format). The output below comes from disassembling it with <code>objdump -d</code>. Also note that this has been compiled for the <a href="https://github.com/a2flo/floor/blob/master/compute/host/host_common.hpp#L44"><code>x86-5</code> target</a> (AVX-512+).
++++

[source,Assembly]
nbody.bin: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <simplified_nbody>: 0: 55 push %rbp 1: 48 89 e5 mov %rsp,%rbp 4: 41 57 push %r15 6: 41 56 push %r14 8: 41 55 push %r13 a: 41 54 push %r12 c: 53 push %rbx d: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp 11: 48 81 ec 40 09 00 00 sub $0x940,%rsp 18: 48 8d 05 f9 ff ff ff lea -0x7(%rip),%rax # 18 <simplified_nbody+0x18> 1f: 49 be 00 00 00 00 00 movabs $0x0,%r14 26: 00 00 00 29: 48 89 4c 24 50 mov %rcx,0x50(%rsp) 2e: 48 89 74 24 68 mov %rsi,0x68(%rsp) 33: 48 89 7c 24 48 mov %rdi,0x48(%rsp) 38: 49 01 c6 add %rax,%r14 3b: 48 b8 00 00 00 00 00 movabs $0x0,%rax 42: 00 00 00 45: 49 8b 04 06 mov (%r14,%rax,1),%rax 49: 8b 00 mov (%rax),%eax 4b: 48 8d 0c 40 lea (%rax,%rax,2),%rcx 4f: 48 89 c6 mov %rax,%rsi 52: 48 c1 e6 04 shl $0x4,%rsi 56: 48 89 74 24 58 mov %rsi,0x58(%rsp) 5b: 48 8d 04 8a lea (%rdx,%rcx,4),%rax 5f: c5 fa 10 04 8a vmovss (%rdx,%rcx,4),%xmm0 64: c5 f9 6e 54 8a 04 vmovd 0x4(%rdx,%rcx,4),%xmm2 6a: c5 fa 10 4c
Related Skills
node-connect
346.8k — Diagnose OpenClaw node connection and pairing failures for Android, iOS, and macOS companion apps
frontend-design
107.6k — Create distinctive, production-grade frontend interfaces with high design quality. Use this skill when the user asks to build web components, pages, or applications. Generates creative, polished code that avoids generic AI aesthetics.
openai-whisper-api
346.8k — Transcribe audio via OpenAI Audio Transcriptions API (Whisper).
qqbot-media
346.8k — QQBot 富媒体收发能力。使用 <qqmedia> 标签,系统根据文件扩展名自动识别类型(图片/语音/视频/文件)。
