#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>

namespace {

// Alignment requested from aligned_alloc; also the exclusive upper bound for
// the byte offsets accepted on the command line.
constexpr std::size_t kAlignment = 64;
// Number of bytes copied per pass over the buffers.
constexpr std::size_t kCopyBytes = 4096;
// Allocation size: the copied region plus slack so that shifting the start by
// up to kAlignment - 1 bytes still stays inside the allocation.
constexpr std::size_t kAllocBytes = kCopyBytes + kAlignment;
// Bytes moved by one micro() call (four 64-byte zmm registers).
constexpr std::size_t kChunkBytes = 64 * 4;
// Passes executed for the warm-up phase and again for the timed phase.
constexpr int kPasses = 1000000;

}  // namespace

/// Copies 256 bytes from `src` to `dst` using four unaligned 512-bit loads
/// followed by four unaligned 512-bit stores (vmovdqu64 via zmm1..zmm4).
///
/// @param dst Destination address; 256 bytes starting here are written.
/// @param src Source address; 256 bytes starting here are read.
///
/// Neither pointer needs any particular alignment. The pointers are only used
/// as raw addresses by the assembly and are never dereferenced as `int`.
void micro(int* dst, int* src) {
    asm volatile(
        "movq %[src], %%rax \n\t"
        "vmovdqu64 0x0(%%rax), %%zmm1 \n\t"
        "vmovdqu64 0x40(%%rax), %%zmm2 \n\t"
        "vmovdqu64 0x80(%%rax), %%zmm3 \n\t"
        "vmovdqu64 0xC0(%%rax), %%zmm4 \n\t"
        "movq %[dst], %%rax \n\t"
        "vmovdqu64 %%zmm1, 0x0(%%rax) \n\t"
        "vmovdqu64 %%zmm2, 0x40(%%rax) \n\t"
        "vmovdqu64 %%zmm3, 0x80(%%rax) \n\t"
        "vmovdqu64 %%zmm4, 0xC0(%%rax) \n\t"
        :  // no outputs: neither pointer register is modified
        : [dst] "r"(dst), [src] "r"(src)
        // "memory": the asm reads and writes memory that is not described by
        // any operand, so the compiler must not cache or reorder accesses
        // across this statement.
        : "rax", "zmm1", "zmm2", "zmm3", "zmm4", "memory");
}

/// Runs `passes` full copies of kCopyBytes bytes from `src` to `dst`,
/// issuing one micro() call per kChunkBytes-sized chunk.
static void run_passes(int* dst, int* src, int passes) {
    char* const dst_bytes = reinterpret_cast<char*>(dst);
    char* const src_bytes = reinterpret_cast<char*>(src);
    for (int pass = 0; pass < passes; ++pass) {
        for (std::size_t off = 0; off < kCopyBytes; off += kChunkBytes) {
            micro(reinterpret_cast<int*>(dst_bytes + off),
                  reinterpret_cast<int*>(src_bytes + off));
        }
    }
}

/// Entry point.
///
/// Usage: <program> <load_offset> <store_offset>
///   load_offset  - byte offset in [0, 64) added to the source buffer start
///   store_offset - byte offset in [0, 64) added to the destination buffer start
///
/// Performs one untimed warm-up phase, then times an identical phase and
/// prints the elapsed time as "[time] <N> ms". Returns 0 on success and -1 on
/// bad arguments or allocation failure.
int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "usage: <program> <load_offset> <store_offset>"
                     " (each offset in [0, 64))\n";
        return -1;
    }

    const int load_offset = std::atoi(argv[1]);
    const int store_offset = std::atoi(argv[2]);
    const int offset_limit = static_cast<int>(kAlignment);

    // Validate explicitly instead of with assert(): the check must survive
    // NDEBUG builds, because an out-of-range offset would run past the buffer.
    if (load_offset < 0 || load_offset >= offset_limit ||
        store_offset < 0 || store_offset >= offset_limit) {
        std::cerr << "error: offsets must be in [0, 64)\n";
        return -1;
    }

    // Keep the base pointers so the buffers can be released at the end.
    void* const src_base = std::aligned_alloc(kAlignment, kAllocBytes);
    void* const dst_base = std::aligned_alloc(kAlignment, kAllocBytes);
    if (src_base == nullptr || dst_base == nullptr) {
        std::cerr << "error: aligned_alloc failed\n";
        std::free(src_base);
        std::free(dst_base);
        return -1;
    }

    // Fill the entire source buffer (the byte count, not sizeof of the
    // expression, which would only be sizeof(int)).
    std::memset(src_base, 1, kAllocBytes);

    int* const src =
        reinterpret_cast<int*>(static_cast<char*>(src_base) + load_offset);
    int* const dst =
        reinterpret_cast<int*>(static_cast<char*>(dst_base) + store_offset);

    // Warm-up phase: same work as the timed phase, result discarded.
    run_passes(dst, src, kPasses);

    // steady_clock is monotonic, which is what interval measurement needs.
    const auto start = std::chrono::steady_clock::now();
    run_passes(dst, src, kPasses);
    const auto end = std::chrono::steady_clock::now();

    const auto diff =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "[time] " << diff.count() << " ms" << std::endl;

    std::free(src_base);
    std::free(dst_base);
    return 0;
}