From f0cf514eb3ca30c5170e534c3861ad73996c7726 Mon Sep 17 00:00:00 2001 From: johannst Date: Tue, 22 Aug 2023 21:38:08 +0000 Subject: deploy: 9bb639287cae88b32fc1b17b7a4b494340e54434 --- development/c++.html | 188 ++++++++++++++++++++++- development/c++filt.html | 2 +- development/gcc.html | 2 +- development/gcov.html | 6 +- development/glibc.html | 2 +- development/index.html | 3 +- development/ld.so.html | 2 +- development/make.html | 2 +- development/pgo.html | 363 +++++++++++++++++++++++++++++++++++++++++++++ development/python.html | 2 +- development/symbolver.html | 3 +- 11 files changed, 563 insertions(+), 12 deletions(-) create mode 100644 development/pgo.html (limited to 'development') diff --git a/development/c++.html b/development/c++.html index 94cc7fd..9e296dd 100644 --- a/development/c++.html +++ b/development/c++.html @@ -83,7 +83,7 @@ @@ -170,6 +170,7 @@

c++

+

openstd cpp standards.

Source files of most examples is available here.

Type deduction

Force compile error to see what auto is deduced to.

@@ -178,6 +179,191 @@ // force compile error typename decltype(foo)::_; +

Strict aliasing and type punning

+

The strict aliasing rules describe via which alias a value can be accessed.

+
+

Informal: an alias is a reference / pointer to a value.

+
+

Accessing a value through an alias that violates the strict aliasing rules is +undefined behavior (UB).

+

Examples below on godbolt.

+
int i = 0;
+
+// Valid aliasing (signed / unsigned type).
+*reinterpret_cast<signed int*>(&i);
+*reinterpret_cast<unsigned int*>(&i);
+
+// Valid aliasing (cv qualified type).
+*reinterpret_cast<const int*>(&i);
+*reinterpret_cast<const unsigned*>(&i);
+
+// Valid aliasing (byte type).
+*reinterpret_cast<char*>(&i);
+*reinterpret_cast<std::byte*>(&i);
+
+// Invalid aliasing, dereferencing pointer is UB.
+*reinterpret_cast<short*>(&i);
+*reinterpret_cast<float*>(&i);
+
+
+

NOTE: Casting pointer to invalid aliasing type is not directly UB, but +dereferencing the pointer is UB.

+
+
short s[2] = { 1, 2 };
+
+// Invalid aliasing (UB) - type punning, UB to deref ptr (int has stricter
+// alignment requirements than short).
+*reinterpret_cast<int*>(s);
+
+
+// Arbitrary byte pointer.
+char c[4] = { 1, 2, 3, 4 };
+
+// Invalid aliasing (UB) - type punning, UB to deref ptr (int has stricter
+// alignment requirements than char).
+*reinterpret_cast<int*>(c);
+
+

At the time of writing, the current c++ std draft +contains the following.

+
If a program attempts to access the stored value of an object through a glvalue
+whose type is not **similar** (7.3.6) to one of the following types the
+behavior is undefined [44]
+
+(11.1) the dynamic type of the object,
+(11.2) a type that is the signed or unsigned type corresponding to the dynamic
+       type of the object, or
+(11.3) a char, unsigned char, or std::byte type.
+
+[44]: The intent of this list is to specify those circumstances in which an
+      object can or cannot be aliased.
+
+

The paragraph is short but one also needs to understand the meaning of +similar (similar_types).

+

This paragraph is actually somewhat more explicit in the c++17 std.

+
If a program attempts to access the stored value of an object through a glvalue
+of other than one of the following types the behavior is undefined [63]
+
+(11.1) the dynamic type of the object,
+(11.2) a cv-qualified version of the dynamic type of the object,
+(11.3) a type similar (as defined in 7.5) to the dynamic type of the object,
+(11.4) a type that is the signed or unsigned type corresponding to the dynamic
+       type of the object,
+(11.5) a type that is the signed or unsigned type corresponding to a
+       cv-qualified version of the dynamic type of the object,
+(11.6) an aggregate or union type that includes one of the aforementioned types
+       among its elements or non- static data members (including, recursively,
+       an element or non-static data member of a subaggregate or contained
+       union),
+(11.7) a type that is a (possibly cv-qualified) base class type of the dynamic
+       type of the object,
+(11.8) a char, unsigned char, or std::byte type.
+
+[63]: The intent of this list is to specify those circumstances in which an
+      object may or may not be aliased.
+
+

Additional references:

+
    +
  • +

    What is the Strict Aliasing Rule and Why do we care

    +

    The article shows a small example how the compiler may optimized using the +strict aliasing rules.

    +
    int alias(int* i, char* c) {
    +  *i = 1;
    +  *c = 'a';  // char* may alias int*
    +  return *i;
    +}
    +
    +int noalias(int* i, short* s) {
    +    *i = 1;
    +    *s = 2;  // short* does not alias int*
    +    return *i;
    +}
    +
    +
    alias(int*, char*):
    +mov    DWORD PTR [rdi] ,0x1  ; *i = 1;
    +mov    BYTE PTR [rsi], 0x61  ; *c = 'a';
    +mov    eax,DWORD PTR [rdi]   ; Must reload, char* can alias int*.
    +ret
    +
    +noalias(int*, short*):
    +mov    DWORD PTR [rdi], 0x1  ; *i = 1;
    +mov    WORD PTR [rsi], 0x2   ; *s = 2;
    +mov    eax,0x1               ; Must not reload, short* can not alias int*.
    +ret
    +
    +
  • +
  • +

    reinterpret_cast type aliasing

    +
    +
      +
    1. Any object pointer type T1* can be converted to another object pointer +type cv T2*. This is exactly equivalent to static_cast<cv T2*>(static_cast<cv void*>(expression)) (which implies that if T2's +alignment requirement is not stricter than T1's, the value of the pointer +does not change and conversion of the resulting pointer back to its +original type yields the original value). In any case, the resulting +pointer may only be dereferenced safely if allowed by the type aliasing +rules (see below).
    2. +
    +
    +
    int I;
    +char* X = reinterpret_cast<char*>(&I);  // Valid, char allowed to alias int.
    +*X = 42;
    +int* Y = reinterpret_cast<int*>(X);     // Cast back to original type.
    +*Y = 1337;  // safe
    +
    +char C[4];
    +int* P = reinterpret_cast<int*>(C);     // Cast is ok, not yet UB.
    +*P = 1337; // UB, violates strict aliasing / alignment rules.
    +           // https://stackoverflow.com/questions/52492229/c-byte-array-to-int
    +
    +
  • +
  • +

    On gcc strict aliasing is enabled starting with -O2.

    +
    for i in {0..3} g s; do echo "-O$i $(g++ -Q --help=optimizers -O$i | grep fstrict-aliasing)"; done
    +-O0   -fstrict-aliasing           [disabled]
    +-O1   -fstrict-aliasing           [disabled]
    +-O2   -fstrict-aliasing           [enabled]
    +-O3   -fstrict-aliasing           [enabled]
    +-Og   -fstrict-aliasing           [disabled]
    +-Os   -fstrict-aliasing           [enabled]
    +
    +
  • +
+

__restrict keyword

+

The __restrict keyword allows the programmer to tell the compiler that two +pointer will not alias each other.

+
int alias(int* a, int* b) {
+    *a = 1;
+    *b = 2;
+    return *a;
+}
+
+// alias(int*, int*):                           # @alias(int*, int*)
+//         mov     dword ptr [rdi], 1
+//         mov     dword ptr [rsi], 2
+//         mov     eax, dword ptr [rdi]
+//         ret
+
+int noalias(int* __restrict a, int* __restrict b) {
+    *a = 1;
+    *b = 2;
+    return *a;
+}
+
+// noalias(int*, int*):                         # @noalias(int*, int*)
+//         mov     dword ptr [rdi], 1
+//         mov     dword ptr [rsi], 2
+//         mov     eax, 1
+//         ret
+
+

However this should only be used with care and in a narrow scope, as it is easy +to violate self defined contract, see godbolt.

+

Type punning

+

The correct way to do type-punning in c++:

+
    +
  1. std::bit_cast (c++20)
  2. +
  3. std::memcpy
  4. +

Variadic templates (parameter pack)

#include <iostream>
 
diff --git a/development/c++filt.html b/development/c++filt.html
index 1468f2a..e7b7614 100644
--- a/development/c++filt.html
+++ b/development/c++filt.html
@@ -83,7 +83,7 @@
 
         
diff --git a/development/gcc.html b/development/gcc.html
index 4490222..c8a236a 100644
--- a/development/gcc.html
+++ b/development/gcc.html
@@ -83,7 +83,7 @@
 
         
diff --git a/development/gcov.html b/development/gcov.html
index e065a03..6103b8d 100644
--- a/development/gcov.html
+++ b/development/gcov.html
@@ -83,7 +83,7 @@
 
         
@@ -265,7 +265,7 @@ clean:
                                 
                             
 
-                            
 
@@ -279,7 +279,7 @@ clean:
                         
                     
 
-                    
             
diff --git a/development/glibc.html b/development/glibc.html
index 7961bf3..2807110 100644
--- a/development/glibc.html
+++ b/development/glibc.html
@@ -83,7 +83,7 @@
 
         
diff --git a/development/index.html b/development/index.html
index edd1fef..696e124 100644
--- a/development/index.html
+++ b/development/index.html
@@ -83,7 +83,7 @@
 
         
@@ -180,6 +180,7 @@
 
  • symbol versioning
  • python
  • gcov
  • +
  • pgo
  • diff --git a/development/ld.so.html b/development/ld.so.html index 8a88d42..110cc1c 100644 --- a/development/ld.so.html +++ b/development/ld.so.html @@ -83,7 +83,7 @@ diff --git a/development/make.html b/development/make.html index 630715d..36d174a 100644 --- a/development/make.html +++ b/development/make.html @@ -83,7 +83,7 @@ diff --git a/development/pgo.html b/development/pgo.html new file mode 100644 index 0000000..b928df6 --- /dev/null +++ b/development/pgo.html @@ -0,0 +1,363 @@ + + + + + + pgo - Notes + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + + + +
    +
    +

    Profile guided optimization (pgo)

    +

    pgo is an optimization technique to optimize a program for its usual +workload.

    +

    It is applied in two phases:

    +
      +
    1. Collect profiling data (best with representative benchmarks).
    2. +
    3. Optimize program based on collected profiling data.
    4. +
    +

    The following simple program is used as demonstrator.

    +
    #include <stdio.h>
    +
    +#define NOINLINE __attribute__((noinline))
    +
    +NOINLINE void foo() { puts("foo()"); }
    +NOINLINE void bar() { puts("bar()"); }
    +
    +int main(int argc, char *argv[]) {
    +  if (argc == 2) {
    +    foo();
    +  } else {
    +    bar();
    +  }
    +}
    +
    +

    clang

    +

    On the actual machine with clang 15.0.7, the following code is generated for +the main() function.

    +
    # clang -o test test.c -O3
    +
    +0000000000001160 <main>:
    +    1160:  50                   push   rax
    +    ; Jump if argc != 2.
    +    1161:  83 ff 02             cmp    edi,0x2
    +    1164:  75 09                jne    116f <main+0xf>
    +    ; foor() is on the hot path (fall-through).
    +    1166:  e8 d5 ff ff ff       call   1140 <_Z3foov>
    +    116b:  31 c0                xor    eax,eax
    +    116d:  59                   pop    rcx
    +    116e:  c3                   ret
    +    ; bar() is on the cold path (branch).
    +    116f:  e8 dc ff ff ff       call   1150 <_Z3barv>
    +    1174:  31 c0                xor    eax,eax
    +    1176:  59                   pop    rcx
    +    1177:  c3                   ret
    +
    +

    The following shows how to compile with profiling instrumentation and how to +optimize the final program with the collected profiling data (llvm +pgo).

    +

    The arguments to ./test are chosen such that 9/10 runs call bar(), which +is currently on the cold path.

    +
    # Compile test program with profiling instrumentation.
    +clang -o test test.cc -O3 -fprofile-instr-generate
    +
    +# Collect profiling data from multiple runs.
    +for i in {0..10}; do
    +    LLVM_PROFILE_FILE="prof.clang/%p.profraw" ./test $(seq 0 $i)
    +done
    +
    +# Merge raw profiling data into single profile data.
    +llvm-profdata merge -o pgo.profdata prof.clang/*.profraw
    +
    +# Optimize test program with profiling data.
    +clang -o test test.cc -O3 -fprofile-use=pgo.profdata
    +
    +
    +

    NOTE: If LLVM_PROFILE_FILE is not given the profile data is written to +default.profraw which is re-written on each run. If the LLVM_PROFILE_FILE +contains a %m in the filename, a unique integer will be generated and +consecutive runs will update the same generated profraw file, +LLVM_PROFILE_FILE can specify a new file every time, however that requires +more storage in general.

    +
    +

    After optimizing the program with the profiling data, the main() function +looks as follows.

    +
    0000000000001060 <main>:
    +    1060:  50                    push   rax
    +    ; Jump if argc == 2.
    +    1061:  83 ff 02              cmp    edi,0x2
    +    1064:  74 09                 je     106f <main+0xf>
    +    ; bar() is on the hot path (fall-through).
    +    1066:  e8 e5 ff ff ff        call   1050 <_Z3barv>
    +    106b:  31 c0                 xor    eax,eax
    +    106d:  59                    pop    rcx
    +    106e:  c3                    ret
    +    ; foo() is on the cold path (branch).
    +    106f:  e8 cc ff ff ff        call   1040 <_Z3foov>
    +    1074:  31 c0                 xor    eax,eax
    +    1076:  59                    pop    rcx
    +    1077:  c3                    ret
    +
    +

    gcc

    +

    With gcc 13.2.1 on the current machine, the optimizer puts bar() on the +hot path by default.

    +
    0000000000001040 <main>:
    +    1040:  48 83 ec 08          sub    rsp,0x8
    +    ; Jump if argc == 2.
    +    1044:  83 ff 02             cmp    edi,0x2
    +    1047:  74 0c                je     1055 <main+0x15>
    +    ; bar () is on the hot path (fall-through).
    +    1049:  e8 22 01 00 00       call   1170 <_Z3barv>
    +    104e:  31 c0                xor    eax,eax
    +    1050:  48 83 c4 08          add    rsp,0x8
    +    1054:  c3                   ret
    +    ; foo() is on the cold path (branch).
    +    1055:  e8 06 01 00 00       call   1160 <_Z3foov>
    +    105a:  eb f2                jmp    104e <main+0xe>
    +    105c:  0f 1f 40 00          nop    DWORD PTR [rax+0x0]
    +
    +
    +

    The following shows how to compile with profiling instrumentation and how to +optimize the final program with the collected profiling data.

    +

    The arguments to ./test are chosen such that 2/3 runs call foo(), which +is currently on the cold path.

    +
    gcc -o test test.cc -O3 -fprofile-generate
    +./test 1
    +./test 1
    +./test 2 2
    +gcc -o test test.cc -O3 -fprofile-use
    +
    +
    +

    NOTE: Consecutive runs update the generated test.gcda profile data file +rather than re-write it.

    +
    +

    After optimizing the program with the profiling data, the main() function

    +
    0000000000001040 <main.cold>:
    +    ; bar() is on the cold path (branch).
    +    1040:  e8 05 00 00 00       call   104a <_Z3barv>
    +    1045:  e9 25 00 00 00       jmp    106f <main+0xf>
    +
    +0000000000001060 <main>:
    +    1060:  51                   push   rcx
    +    ; Jump if argc != 2.
    +    1061:  83 ff 02             cmp    edi,0x2
    +    1064:  0f 85 d6 ff ff ff    jne    1040 <main.cold>
    +    ; for() is on the hot path (fall-through).
    +    106a:  e8 11 01 00 00       call   1180 <_Z3foov>
    +    106f:  31 c0                xor    eax,eax
    +    1071:  5a                   pop    rdx
    +    1072:  c3                   ret
    +
    + +
    + + +
    +
    + + + +
    + + + + + + + + + + + + + + + + + + +
    + + diff --git a/development/python.html b/development/python.html index 0362177..cb9d6af 100644 --- a/development/python.html +++ b/development/python.html @@ -83,7 +83,7 @@ diff --git a/development/symbolver.html b/development/symbolver.html index a968cbd..1b82a41 100644 --- a/development/symbolver.html +++ b/development/symbolver.html @@ -83,7 +83,7 @@ @@ -372,6 +372,7 @@ func_v2
  • Binutils ld: Symbol Versioning
  • LSB: Symbol Versioning
  • How To Write Shared Libraries
  • +
  • LSB: ELF File Format
  • -- cgit v1.2.3