- added optimization notes in readme
author John Tsiombikas <nuclear@member.fsf.org>
Mon, 12 Feb 2018 04:54:39 +0000 (06:54 +0200)
committer John Tsiombikas <nuclear@member.fsf.org>
Mon, 12 Feb 2018 04:58:04 +0000 (06:58 +0200)
- added cpuid instruction for serializing/flushing cpu state before
  taking rdtsc measurements
- moved perf variables to util.c to silence watcom warnings

Makefile
README.md
src/util.c [new file with mode: 0644]
src/util.h

index 3a84c1a..af258f8 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ demoobj = main.obj demo.obj screen.obj cfgopt.obj music.obj gfxutil.obj &
 scrobj = tunnel.obj fract.obj grise.obj polytest.obj plasma.obj bump.obj &
 thunder.obj metaball.obj greets.obj
 sysobj = gfx.obj vbe.obj watdpmi.obj timer.obj keyb.obj mouse.obj sball.obj &
 scrobj = tunnel.obj fract.obj grise.obj polytest.obj plasma.obj bump.obj &
 thunder.obj metaball.obj greets.obj
 sysobj = gfx.obj vbe.obj watdpmi.obj timer.obj keyb.obj mouse.obj sball.obj &
-logger.obj tinyfps.obj
+logger.obj tinyfps.obj util.obj
 obj = $(baseobj) $(demoobj) $(sysobj) $(scrobj)
 bin = demo.exe
 
 obj = $(baseobj) $(demoobj) $(sysobj) $(scrobj)
 bin = demo.exe
 
index 16206b1..496d395 100644 (file)
--- a/README.md
+++ b/README.md
@@ -41,3 +41,46 @@ The demo datafiles are in their own subversion repo. To checkout the data files
 run the following in the demo root directory:
 
   svn co svn://mutantstargoat.com/datadirs/dosdemo data
 run the following in the demo root directory:
 
   svn co svn://mutantstargoat.com/datadirs/dosdemo data
+
+Random optimization details about the Pentium1 (p54c)
+-----------------------------------------------------
+Use cround64 (util.h) for float -> integer conversions, instead of casts.
+
+Performance measurement with RDTSC:
+    perf_start();
+    /* code under test */
+    perf_end(); /* result in perf_interval_count */
+
+Cache organization (L1): 8 KB data / 8 KB instruction.
+Each cache has 128 sets of 2 cache lines (2-way set associative), 32 bytes per cache line.
+
+Addresses that differ by a multiple of 4096 bytes map to the same set, so only
+two of them can be in the cache at any time.
+
+U/V pipe pairing rules:
+ - both instructions must be simple
+ - no read-after-write or write-after-write reg dependencies
+ - neither instruction may contain both a displacement AND an immediate
+ - instr. with prefixes (except 0x0f) can only run on U pipe.
+ - prefixes are treated as separate 1-byte instructions (except 0x0f).
+ - branches can be paired only as the second instruction of the pair.
+
+Simple instructions are:
+ - mov reg, reg/mem/imm
+ - mov mem, reg/imm
+ - alu reg, reg/mem/imm (alu: add/sub/cmp/and/or/xor)
+ - alu mem, reg/imm
+ - inc reg/mem
+ - dec reg/mem
+ - push reg/mem
+ - pop reg
+ - lea reg,mem
+ - jmp/call/jcc near
+ - nop
+ - test reg,reg/mem
+ - test acc,imm
+
+Instructions pairable only in the U pipe:
+ - adc, sbb
+ - shr, sar, shl, sal with immediate
+ - ror, rol, rcr, rcl with immediate=1
diff --git a/src/util.c b/src/util.c
new file mode 100644 (file)
index 0000000..a91bb95
--- /dev/null
@@ -0,0 +1,3 @@
+#include "util.h"
+
+uint32_t perf_start_count, perf_interval_count;
index f8ae175..6d200b7 100644 (file)
@@ -24,42 +24,53 @@ static INLINE int32_t cround64(double val)
        return *(int32_t*)&val;
 }
 
        return *(int32_t*)&val;
 }
 
-uint32_t perf_start_count, perf_interval_count;
+extern uint32_t perf_start_count, perf_interval_count;
 
 #ifdef __WATCOMC__
 void perf_start(void);
 #pragma aux perf_start = \
 
 #ifdef __WATCOMC__
 void perf_start(void);
 #pragma aux perf_start = \
+       "xor eax, eax" \
+       "cpuid" \
        "rdtsc" \
        "mov [perf_start_count], eax" \
        "rdtsc" \
        "mov [perf_start_count], eax" \
-       modify[eax edx];
+       modify[eax ebx ecx edx];
 
 void perf_end(void);
 #pragma aux perf_end = \
 
 void perf_end(void);
 #pragma aux perf_end = \
+       "xor eax, eax" \
+       "cpuid" \
        "rdtsc" \
        "sub eax, [perf_start_count]" \
        "mov [perf_interval_count], eax" \
        "rdtsc" \
        "sub eax, [perf_start_count]" \
        "mov [perf_interval_count], eax" \
-       modify [eax edx];
+       modify [eax ebx ecx edx];
 #endif
 
 #ifdef __GNUC__
 #define perf_start()  asm volatile ( \
 #endif
 
 #ifdef __GNUC__
 #define perf_start()  asm volatile ( \
+       "xor %%eax, %%eax\n" \
+       "cpuid\n" \
        "rdtsc\n" \
        "mov %%eax, %0\n" \
        "rdtsc\n" \
        "mov %%eax, %0\n" \
-       : "=m"(perf_start_count) :: "%eax", "%edx")
+       : "=m"(perf_start_count) \
+       :: "%eax", "%ebx", "%ecx", "%edx")
 
 #define perf_end() asm volatile ( \
 
 #define perf_end() asm volatile ( \
+       "xor %%eax, %%eax\n" \
+       "cpuid\n" \
        "rdtsc\n" \
        "sub %1, %%eax\n" \
        "mov %%eax, %0\n" \
        : "=m"(perf_interval_count) \
        : "m"(perf_start_count) \
        "rdtsc\n" \
        "sub %1, %%eax\n" \
        "mov %%eax, %0\n" \
        : "=m"(perf_interval_count) \
        : "m"(perf_start_count) \
-       : "%eax", "%edx")
+       : "%eax", "%ebx", "%ecx", "%edx")
 #endif
 
 #ifdef _MSC_VER
 #define perf_start() \
        do { \
                __asm { \
 #endif
 
 #ifdef _MSC_VER
 #define perf_start() \
        do { \
                __asm { \
+                       xor eax, eax \
+                       cpuid \
                        rdtsc \
                        mov [perf_start_count], eax \
                } \
                        rdtsc \
                        mov [perf_start_count], eax \
                } \
@@ -68,6 +79,8 @@ void perf_end(void);
 #define perf_end() \
        do { \
                __asm { \
 #define perf_end() \
        do { \
                __asm { \
+                       xor eax, eax \
+                       cpuid \
                        rdtsc \
                        sub eax, [perf_start_count] \
                        mov [perf_interval_count], eax \
                        rdtsc \
                        sub eax, [perf_start_count] \
                        mov [perf_interval_count], eax \