From f0cf514eb3ca30c5170e534c3861ad73996c7726 Mon Sep 17 00:00:00 2001 From: johannst Date: Tue, 22 Aug 2023 21:38:08 +0000 Subject: deploy: 9bb639287cae88b32fc1b17b7a4b494340e54434 --- print.html | 354 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 346 insertions(+), 8 deletions(-) (limited to 'print.html') diff --git a/print.html b/print.html index f09b78b..c0dea2b 100644 --- a/print.html +++ b/print.html @@ -84,7 +84,7 @@ @@ -1021,7 +1021,7 @@ list-keys -t vi-copy list keymaps for vi-copy mode git show --stat <commit> ................ show files changed by <commit> git show <commit> [<file>] .............. show diffs for <commit> - git git show <commit>:<file> ............ show <file> at <commit> + git show <commit>:<file> ................ show <file> at <commit>

Patching

  git format-patch <opt> <since>/<revision range>
@@ -2736,6 +2736,7 @@ objdump -D -b binary -m i386:x86-64 test-bin
 
  • symbol versioning
  • python
  • gcov
  • +
  • pgo
  • c++filt(1)

    Demangle symbol

    @@ -2746,6 +2747,7 @@ objdump -D -b binary -m i386:x86-64 test-bin
      readelf -W --dyn-syms <elf> | c++filt
     

    c++

    +

    openstd cpp standards.

    Source files of most examples is available here.

    Type deduction

    Force compile error to see what auto is deduced to.

    @@ -2754,6 +2756,191 @@ objdump -D -b binary -m i386:x86-64 test-bin // force compile error typename decltype(foo)::_;
    +

    Strict aliasing and type punning

    +

    The strict aliasing rules describe via which alias a value can be accessed.

    +
    +

    Informal: an alias is a reference / pointer to a value.

    +
    +

    Accessing a value through an alias that violates the strict aliasing rules is +undefined behavior (UB).

    +

    Examples below on godbolt.

    +
    int i = 0;
    +
    +// Valid aliasing (signed / unsigned type).
    +*reinterpret_cast<signed int*>(&i);
    +*reinterpret_cast<unsigned int*>(&i);
    +
    +// Valid aliasing (cv qualified type).
    +*reinterpret_cast<const int*>(&i);
    +*reinterpret_cast<const unsigned*>(&i);
    +
    +// Valid aliasing (byte type).
    +*reinterpret_cast<char*>(&i);
    +*reinterpret_cast<std::byte*>(&i);
    +
    +// Invalid aliasing, dereferencing pointer is UB.
    +*reinterpret_cast<short*>(&i);
    +*reinterpret_cast<float*>(&i);
    +
    +
    +

    NOTE: Casting pointer to invalid aliasing type is not directly UB, but +dereferencing the pointer is UB.

    +
    +
    short s[2] = { 1, 2 };
    +
    +// Invalid aliasing (UB) - type punning, UB to deref ptr (int has stricter
    +// alignment requirements than short).
    +*reinterpret_cast<int*>(s);
    +
    +
    +// Arbitrary byte pointer.
    +char c[4] = { 1, 2, 3, 4 };
    +
    +// Invalid aliasing (UB) - type punning, UB to deref ptr (int has stricter
    +// alignment requirements than char).
    +*reinterpret_cast<int*>(c);
    +
    +

    At the time of writing, the current c++ std draft +contains the following.

    +
    If a program attempts to access the stored value of an object through a glvalue
    +whose type is not **similar** (7.3.6) to one of the following types the
    +behavior is undefined [44]
    +
    +(11.1) the dynamic type of the object,
    +(11.2) a type that is the signed or unsigned type corresponding to the dynamic
    +       type of the object, or
    +(11.3) a char, unsigned char, or std::byte type.
    +
    +[44]: The intent of this list is to specify those circumstances in which an
    +      object can or cannot be aliased.
    +
    +

    The paragraph is short but one also needs to understand the meaning of +similar (similar_types).

    +

    This paragraph is actually somewhat more explicit in the c++17 std.

    +
    If a program attempts to access the stored value of an object through a glvalue
    +of other than one of the following types the behavior is undefined [63]
    +
    +(11.1) the dynamic type of the object,
    +(11.2) a cv-qualified version of the dynamic type of the object,
    +(11.3) a type similar (as defined in 7.5) to the dynamic type of the object,
    +(11.4) a type that is the signed or unsigned type corresponding to the dynamic
    +       type of the object,
    +(11.5) a type that is the signed or unsigned type corresponding to a
    +       cv-qualified version of the dynamic type of the object,
    +(11.6) an aggregate or union type that includes one of the aforementioned types
    +       among its elements or non- static data members (including, recursively,
    +       an element or non-static data member of a subaggregate or contained
    +       union),
    +(11.7) a type that is a (possibly cv-qualified) base class type of the dynamic
    +       type of the object,
    +(11.8) a char, unsigned char, or std::byte type.
    +
    +[63]: The intent of this list is to specify those circumstances in which an
    +      object may or may not be aliased.
    +
    +

    Additional references:

    + +

    __restrict keyword

    +

    The __restrict keyword allows the programmer to tell the compiler that two +pointer will not alias each other.

    +
    int alias(int* a, int* b) {
    +    *a = 1;
    +    *b = 2;
    +    return *a;
    +}
    +
    +// alias(int*, int*):                           # @alias(int*, int*)
    +//         mov     dword ptr [rdi], 1
    +//         mov     dword ptr [rsi], 2
    +//         mov     eax, dword ptr [rdi]
    +//         ret
    +
    +int noalias(int* __restrict a, int* __restrict b) {
    +    *a = 1;
    +    *b = 2;
    +    return *a;
    +}
    +
    +// noalias(int*, int*):                         # @noalias(int*, int*)
    +//         mov     dword ptr [rdi], 1
    +//         mov     dword ptr [rsi], 2
    +//         mov     eax, 1
    +//         ret
    +
    +

    However this should only be used with care and in a narrow scope, as it is easy +to violate self defined contract, see godbolt.

    +

    Type punning

    +

    The correct way to do type-punning in c++:

    +
      +
    1. std::bit_cast (c++20)
    2. +
    3. std::memcpy
    4. +

    Variadic templates (parameter pack)

    #include <iostream>
     
    @@ -3904,6 +4091,7 @@ func_v2
     
  • Binutils ld: Symbol Versioning
  • LSB: Symbol Versioning
  • How To Write Shared Libraries
  • +
  • LSB: ELF File Format
  • python

    Decorator [run]

    @@ -4085,6 +4273,146 @@ clean: 1: 16: return 0; -: 17:}
    +

    Profile guided optimization (pgo)

    +

    pgo is an optimization technique to optimize a program for its usual +workload.

    +

    It is applied in two phases:

    +
      +
    1. Collect profiling data (best with representative benchmarks).
    2. +
    3. Optimize program based on collected profiling data.
    4. +
    +

    The following simple program is used as demonstrator.

    +
    #include <stdio.h>
    +
    +#define NOINLINE __attribute__((noinline))
    +
    +NOINLINE void foo() { puts("foo()"); }
    +NOINLINE void bar() { puts("bar()"); }
    +
    +int main(int argc, char *argv[]) {
    +  if (argc == 2) {
    +    foo();
    +  } else {
    +    bar();
    +  }
    +}
    +
    +

    clang

    +

    On the actual machine with clang 15.0.7, the following code is generated for +the main() function.

    +
    # clang -o test test.c -O3
    +
    +0000000000001160 <main>:
    +    1160:  50                   push   rax
    +    ; Jump if argc != 2.
    +    1161:  83 ff 02             cmp    edi,0x2
    +    1164:  75 09                jne    116f <main+0xf>
    +    ; foor() is on the hot path (fall-through).
    +    1166:  e8 d5 ff ff ff       call   1140 <_Z3foov>
    +    116b:  31 c0                xor    eax,eax
    +    116d:  59                   pop    rcx
    +    116e:  c3                   ret
    +    ; bar() is on the cold path (branch).
    +    116f:  e8 dc ff ff ff       call   1150 <_Z3barv>
    +    1174:  31 c0                xor    eax,eax
    +    1176:  59                   pop    rcx
    +    1177:  c3                   ret
    +
    +

    The following shows how to compile with profiling instrumentation and how to +optimize the final program with the collected profiling data (llvm +pgo).

    +

    The arguments to ./test are chosen such that 9/10 runs call bar(), which +is currently on the cold path.

    +
    # Compile test program with profiling instrumentation.
    +clang -o test test.cc -O3 -fprofile-instr-generate
    +
    +# Collect profiling data from multiple runs.
    +for i in {0..10}; do
    +    LLVM_PROFILE_FILE="prof.clang/%p.profraw" ./test $(seq 0 $i)
    +done
    +
    +# Merge raw profiling data into single profile data.
    +llvm-profdata merge -o pgo.profdata prof.clang/*.profraw
    +
    +# Optimize test program with profiling data.
    +clang -o test test.cc -O3 -fprofile-use=pgo.profdata
    +
    +
    +

    NOTE: If LLVM_PROFILE_FILE is not given the profile data is written to +default.profraw which is re-written on each run. If the LLVM_PROFILE_FILE +contains a %m in the filename, a unique integer will be generated and +consecutive runs will update the same generated profraw file, +LLVM_PROFILE_FILE can specify a new file every time, however that requires +more storage in general.

    +
    +

    After optimizing the program with the profiling data, the main() function +looks as follows.

    +
    0000000000001060 <main>:
    +    1060:  50                    push   rax
    +    ; Jump if argc == 2.
    +    1061:  83 ff 02              cmp    edi,0x2
    +    1064:  74 09                 je     106f <main+0xf>
    +    ; bar() is on the hot path (fall-through).
    +    1066:  e8 e5 ff ff ff        call   1050 <_Z3barv>
    +    106b:  31 c0                 xor    eax,eax
    +    106d:  59                    pop    rcx
    +    106e:  c3                    ret
    +    ; foo() is on the cold path (branch).
    +    106f:  e8 cc ff ff ff        call   1040 <_Z3foov>
    +    1074:  31 c0                 xor    eax,eax
    +    1076:  59                    pop    rcx
    +    1077:  c3                    ret
    +
    +

    gcc

    +

    With gcc 13.2.1 on the current machine, the optimizer puts bar() on the +hot path by default.

    +
    0000000000001040 <main>:
    +    1040:  48 83 ec 08          sub    rsp,0x8
    +    ; Jump if argc == 2.
    +    1044:  83 ff 02             cmp    edi,0x2
    +    1047:  74 0c                je     1055 <main+0x15>
    +    ; bar () is on the hot path (fall-through).
    +    1049:  e8 22 01 00 00       call   1170 <_Z3barv>
    +    104e:  31 c0                xor    eax,eax
    +    1050:  48 83 c4 08          add    rsp,0x8
    +    1054:  c3                   ret
    +    ; foo() is on the cold path (branch).
    +    1055:  e8 06 01 00 00       call   1160 <_Z3foov>
    +    105a:  eb f2                jmp    104e <main+0xe>
    +    105c:  0f 1f 40 00          nop    DWORD PTR [rax+0x0]
    +
    +
    +

    The following shows how to compile with profiling instrumentation and how to +optimize the final program with the collected profiling data.

    +

    The arguments to ./test are chosen such that 2/3 runs call foo(), which +is currently on the cold path.

    +
    gcc -o test test.cc -O3 -fprofile-generate
    +./test 1
    +./test 1
    +./test 2 2
    +gcc -o test test.cc -O3 -fprofile-use
    +
    +
    +

    NOTE: Consecutive runs update the generated test.gcda profile data file +rather than re-write it.

    +
    +

    After optimizing the program with the profiling data, the main() function

    +
    0000000000001040 <main.cold>:
    +    ; bar() is on the cold path (branch).
    +    1040:  e8 05 00 00 00       call   104a <_Z3barv>
    +    1045:  e9 25 00 00 00       jmp    106f <main+0xf>
    +
    +0000000000001060 <main>:
    +    1060:  51                   push   rcx
    +    ; Jump if argc != 2.
    +    1061:  83 ff 02             cmp    edi,0x2
    +    1064:  0f 85 d6 ff ff ff    jne    1040 <main.cold>
    +    ; for() is on the hot path (fall-through).
    +    106a:  e8 11 01 00 00       call   1180 <_Z3foov>
    +    106f:  31 c0                xor    eax,eax
    +    1071:  5a                   pop    rdx
    +    1072:  c3                   ret
    +

    Linux

    arm64

    keywords: arm64, aarch64, abi

    @@ -5975,6 +6312,7 @@ required when compiling natively on riscv.

    -- cgit v1.2.3