From 8c475e1fd504f46cbb8a882e760714925c012890 Mon Sep 17 00:00:00 2001 From: John Tsiombikas Date: Mon, 12 Feb 2018 06:54:39 +0200 Subject: [PATCH] - added optimization notes in readme - added cpuid instruction for serializing/flushing cpu state before taking rdtsc measurements - moved perf variables to util.c to silence watcom warnings --- Makefile | 2 +- README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ src/util.c | 3 +++ src/util.h | 23 ++++++++++++++++++----- 4 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 src/util.c diff --git a/Makefile b/Makefile index 3a84c1a..af258f8 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ demoobj = main.obj demo.obj screen.obj cfgopt.obj music.obj gfxutil.obj & scrobj = tunnel.obj fract.obj grise.obj polytest.obj plasma.obj bump.obj & thunder.obj metaball.obj greets.obj sysobj = gfx.obj vbe.obj watdpmi.obj timer.obj keyb.obj mouse.obj sball.obj & -logger.obj tinyfps.obj +logger.obj tinyfps.obj util.obj obj = $(baseobj) $(demoobj) $(sysobj) $(scrobj) bin = demo.exe diff --git a/README.md b/README.md index 16206b1..496d395 100644 --- a/README.md +++ b/README.md @@ -41,3 +41,46 @@ The demo datafiles are in their own subversion repo. To checkout the data files run the following in the demo root directory: svn co svn://mutantstargoat.com/datadirs/dosdemo data + +Random optimization details about the Pentium1 (p54c) +----------------------------------------------------- +Use cround64 (util.h) for float -> integer conversions, instead of casts. + +Performance measurement with RDTSC: + perf_start(); + /* code under test */ + perf_end(); /* result in perf_interval_count */ + +Cache organization (L1): 8 KB data / 8 KB instruction +128 sets of 2 cache lines, 32 bytes per cache line. + +Addresses that are a multiple of 4096 bytes (128 sets x 32 bytes) apart map to +the same set, so only two such addresses can be cached at any time. 
+ +U/V pipe pairing rules: + - both instructions must be simple + - no read-after-write or write-after-write reg dependencies + - no displacement AND immediate in either instruction + - instr. with prefixes (except 0x0f) can only run on U pipe. + - prefixes are treated as separate 1-byte instructions (except 0x0f). + - branches can be paired if they are the second instr. of the pair only. + +Simple instructions are: + - mov reg, reg/mem/imm + - mov mem, reg/imm + - alu reg, reg/mem/imm (alu: add/sub/cmp/and/or/xor) + - alu mem, reg/imm + - inc reg/mem + - dec reg/mem + - push reg/mem + - pop reg + - lea reg,mem + - jmp/call/jcc near + - nop + - test reg,reg/mem + - test acc,imm + +U-only pairable instructions: + - adc, sbb + - shr, sar, shl, sal with immediate + - ror, rol, rcr, rcl with immediate=1 diff --git a/src/util.c b/src/util.c new file mode 100644 index 0000000..a91bb95 --- /dev/null +++ b/src/util.c @@ -0,0 +1,3 @@ +#include "util.h" + +uint32_t perf_start_count, perf_interval_count; diff --git a/src/util.h b/src/util.h index f8ae175..6d200b7 100644 --- a/src/util.h +++ b/src/util.h @@ -24,42 +24,53 @@ static INLINE int32_t cround64(double val) return *(int32_t*)&val; } -uint32_t perf_start_count, perf_interval_count; +extern uint32_t perf_start_count, perf_interval_count; #ifdef __WATCOMC__ void perf_start(void); #pragma aux perf_start = \ + "xor eax, eax" \ + "cpuid" \ "rdtsc" \ "mov [perf_start_count], eax" \ - modify[eax edx]; + modify[eax ebx ecx edx]; void perf_end(void); #pragma aux perf_end = \ + "xor eax, eax" \ + "cpuid" \ "rdtsc" \ "sub eax, [perf_start_count]" \ "mov [perf_interval_count], eax" \ - modify [eax edx]; + modify [eax ebx ecx edx]; #endif #ifdef __GNUC__ #define perf_start() asm volatile ( \ + "xor %%eax, %%eax\n" \ + "cpuid\n" \ "rdtsc\n" \ "mov %%eax, %0\n" \ - : "=m"(perf_start_count) :: "%eax", "%edx") + : "=m"(perf_start_count) \ + :: "%eax", "%ebx", "%ecx", "%edx") #define perf_end() asm volatile ( \ + "xor %%eax, %%eax\n" \ 
+ "cpuid\n" \ "rdtsc\n" \ "sub %1, %%eax\n" \ "mov %%eax, %0\n" \ : "=m"(perf_interval_count) \ : "m"(perf_start_count) \ - : "%eax", "%edx") + : "%eax", "%ebx", "%ecx", "%edx") #endif #ifdef _MSC_VER #define perf_start() \ do { \ __asm { \ + xor eax, eax \ + cpuid \ rdtsc \ mov [perf_start_count], eax \ } \ @@ -68,6 +79,8 @@ void perf_end(void); #define perf_end() \ do { \ __asm { \ + xor eax, eax \ + cpuid \ rdtsc \ sub eax, [perf_start_count] \ mov [perf_interval_count], eax \ -- 1.7.10.4