Better performance of Signal decoding

Issue #10 new
WorkerH created an issue

Hey,

I wrote some code (untested) to speed up the signal decoding function:

uint64_t Signal::decode(std::vector<uint8_t> & data)
{
    uint64_t mask;
    switch (bitSize)
    {
        case 0: mask = 0ull; break;
        case 1: mask = 1ull; break;
        case 2: mask = 3ull; break;
        case 3: mask = 7ull; break;
        case 4: mask = 15ull; break;
        case 5: mask = 31ull; break;
        case 6: mask = 63ull; break;
        case 7: mask = 127ull; break;
        case 8: mask = 255ull; break;
        case 9: mask = 511ull; break;
        case 10: mask = 1023ull; break;
        case 11: mask = 2047ull; break;
        case 12: mask = 4095ull; break;
        case 13: mask = 8191ull; break;
        case 14: mask = 16383ull; break;
        case 15: mask = 32767ull; break;
        case 16: mask = 65535ull; break;
        case 17: mask = 131071ull; break;
        case 18: mask = 262143ull; break;
        case 19: mask = 524287ull; break;
        case 20: mask = 1048575ull; break;
        case 21: mask = 2097151ull; break;
        case 22: mask = 4194303ull; break;
        case 23: mask = 8388607ull; break;
        case 24: mask = 16777215ull; break;
        case 25: mask = 33554431ull; break;
        case 26: mask = 67108863ull; break;
        case 27: mask = 134217727ull; break;
        case 28: mask = 268435455ull; break;
        case 29: mask = 536870911ull; break;
        case 30: mask = 1073741823ull; break;
        case 31: mask = 2147483647ull; break;
        case 32: mask = 4294967295ull; break;
        case 33: mask = 8589934591ull; break;
        case 34: mask = 17179869183ull; break;
        case 35: mask = 34359738367ull; break;
        case 36: mask = 68719476735ull; break;
        case 37: mask = 137438953471ull; break;
        case 38: mask = 274877906943ull; break;
        case 39: mask = 549755813887ull; break;
        case 40: mask = 1099511627775ull; break;
        case 41: mask = 2199023255551ull; break;
        case 42: mask = 4398046511103ull; break;
        case 43: mask = 8796093022207ull; break;
        case 44: mask = 17592186044415ull; break;
        case 45: mask = 35184372088831ull; break;
        case 46: mask = 70368744177663ull; break;
        case 47: mask = 140737488355327ull; break;
        case 48: mask = 281474976710655ull; break;
        case 49: mask = 562949953421311ull; break;
        case 50: mask = 1125899906842623ull; break;
        case 51: mask = 2251799813685247ull; break;
        case 52: mask = 4503599627370495ull; break;
        case 53: mask = 9007199254740991ull; break;
        case 54: mask = 18014398509481983ull; break;
        case 55: mask = 36028797018963967ull; break;
        case 56: mask = 72057594037927935ull; break;
        case 57: mask = 144115188075855871ull; break;
        case 58: mask = 288230376151711743ull; break;
        case 59: mask = 576460752303423487ull; break;
        case 60: mask = 1152921504606846975ull; break;
        case 61: mask = 2305843009213693951ull; break;
        case 62: mask = 4611686018427387903ull; break;
        case 63: mask = 9223372036854775807ull; break;
        case 64: mask = 18446744073709551615ull; break;
    }
    uint64_t buffer = 0;
    if (byteOrder == ByteOrder::BigEndian)
    {
        uint64_t p = *reinterpret_cast<uint64_t*>(&data[0]);
        switch (bitSize / 8 + (bitSize % 8 == 0 ? 0 : 1))
        {
        case 0: case 1: buffer = p; break;
        case 2: buffer = (p & ~mask) | ((uint64_t)data[0] << 8) | (uint64_t)data[1]; break;
        case 3: buffer = (p & ~mask) | ((uint64_t)data[0] << 16) | ((uint64_t)data[1] << 8) | (uint64_t)data[2]; break;
        case 4: buffer = (p & ~mask) | ((uint64_t)data[0] << 24) | ((uint64_t)data[1] << 16) | ((uint64_t)data[2] << 8) | (uint64_t)data[3]; break;
        case 5: buffer = (p & ~mask) | ((uint64_t)data[0] << 32) | ((uint64_t)data[1] << 24) | ((uint64_t)data[2] << 16) | ((uint64_t)data[3] << 8) | (uint64_t)data[4]; break;
        case 6: buffer = (p & ~mask) | ((uint64_t)data[0] << 40) | ((uint64_t)data[1] << 32) | ((uint64_t)data[2] << 24) | ((uint64_t)data[3] << 16) | ((uint64_t)data[4] << 8) | (uint64_t)data[5]; break;
        case 7: buffer = (p & ~mask) | ((uint64_t)data[0] << 48) | ((uint64_t)data[1] << 40) | ((uint64_t)data[2] << 32) | ((uint64_t)data[3] << 24) | ((uint64_t)data[4] << 16) | ((uint64_t)data[5] << 8) | (uint64_t)data[6]; break;
        case 8: buffer = (p & ~mask) | ((uint64_t)data[0] << 56) | ((uint64_t)data[1] << 48) | ((uint64_t)data[2] << 40) | ((uint64_t)data[3] << 32) | ((uint64_t)data[4] << 24) | ((uint64_t)data[5] << 16) | ((uint64_t)data[6] << 8) | (uint64_t)data[7]; break;
        }
        }
    }
    else
    {
        buffer = *reinterpret_cast<uint64_t*>(&data[0]);
    }
    uint64_t result = (buffer >> startBit) & mask;
    if (valueType == ValueType::Signed)
    {
        // sign extend only if the sign bit of the extracted value is set
        if ((result >> (bitSize - 1)) & 1)
        {
            result |= ~mask;
        }
    }
    return result;
}

This should run much faster than the old code. I also guess there is some more improvement potential in my code.

Comments (22)

  1. Tobias Lorenz repo owner

    Hi Julian,

    that looks great. I'll integrate it in the next few days. I also need to extend the unit tests to have 100% coverage again. This is also related to #4.

    Bye Tobias

  2. WorkerH reporter

    I had an even better idea. Since a decoding function depends on the bit length, endianness and signedness (256 possible combinations), it would be possible to pre-generate the decode functions while parsing the dbc. Doing this would make the code as fast as generated code. Here is a schematic example of what I mean:

    template <uint64_t BitLen, ByteOrder Endianess, bool Signed>
    class VECTOR_DBC_EXPORT NewSignal
        : public Signal
    {
    public:
        virtual uint64_t decode(std::basic_string_view<uint8_t> data) override
        {
            uint64_t mask = 0;
            if constexpr (BitLen == 0) mask = 0ull;
            else if constexpr (BitLen == 1) mask = 1ull;
            else if constexpr (BitLen == 2) mask = 3ull;
            else if constexpr (BitLen == 3) mask = 7ull;
            else if constexpr (BitLen == 4) mask = 15ull;
            else if constexpr (BitLen == 5) mask = 31ull;
            else if constexpr (BitLen == 6) mask = 63ull;
            else if constexpr (BitLen == 7) mask = 127ull;
            else if constexpr (BitLen == 8) mask = 255ull;
            else if constexpr (BitLen == 9) mask = 511ull;
            else if constexpr (BitLen == 10) mask = 1023ull;
            else if constexpr (BitLen == 11) mask = 2047ull;
            else if constexpr (BitLen == 12) mask = 4095ull;
            else if constexpr (BitLen == 13) mask = 8191ull;
            else if constexpr (BitLen == 14) mask = 16383ull;
            else if constexpr (BitLen == 15) mask = 32767ull;
            else if constexpr (BitLen == 16) mask = 65535ull;
            else if constexpr (BitLen == 17) mask = 131071ull;
            else if constexpr (BitLen == 18) mask = 262143ull;
            else if constexpr (BitLen == 19) mask = 524287ull;
            else if constexpr (BitLen == 20) mask = 1048575ull;
            else if constexpr (BitLen == 21) mask = 2097151ull;
            else if constexpr (BitLen == 22) mask = 4194303ull;
            else if constexpr (BitLen == 23) mask = 8388607ull;
            else if constexpr (BitLen == 24) mask = 16777215ull;
            else if constexpr (BitLen == 25) mask = 33554431ull;
            else if constexpr (BitLen == 26) mask = 67108863ull;
            else if constexpr (BitLen == 27) mask = 134217727ull;
            else if constexpr (BitLen == 28) mask = 268435455ull;
            else if constexpr (BitLen == 29) mask = 536870911ull;
            else if constexpr (BitLen == 30) mask = 1073741823ull;
            else if constexpr (BitLen == 31) mask = 2147483647ull;
            else if constexpr (BitLen == 32) mask = 4294967295ull;
            else if constexpr (BitLen == 33) mask = 8589934591ull;
            else if constexpr (BitLen == 34) mask = 17179869183ull;
            else if constexpr (BitLen == 35) mask = 34359738367ull;
            else if constexpr (BitLen == 36) mask = 68719476735ull;
            else if constexpr (BitLen == 37) mask = 137438953471ull;
            else if constexpr (BitLen == 38) mask = 274877906943ull;
            else if constexpr (BitLen == 39) mask = 549755813887ull;
            else if constexpr (BitLen == 40) mask = 1099511627775ull;
            else if constexpr (BitLen == 41) mask = 2199023255551ull;
            else if constexpr (BitLen == 42) mask = 4398046511103ull;
            else if constexpr (BitLen == 43) mask = 8796093022207ull;
            else if constexpr (BitLen == 44) mask = 17592186044415ull;
            else if constexpr (BitLen == 45) mask = 35184372088831ull;
            else if constexpr (BitLen == 46) mask = 70368744177663ull;
            else if constexpr (BitLen == 47) mask = 140737488355327ull;
            else if constexpr (BitLen == 48) mask = 281474976710655ull;
            else if constexpr (BitLen == 49) mask = 562949953421311ull;
            else if constexpr (BitLen == 50) mask = 1125899906842623ull;
            else if constexpr (BitLen == 51) mask = 2251799813685247ull;
            else if constexpr (BitLen == 52) mask = 4503599627370495ull;
            else if constexpr (BitLen == 53) mask = 9007199254740991ull;
            else if constexpr (BitLen == 54) mask = 18014398509481983ull;
            else if constexpr (BitLen == 55) mask = 36028797018963967ull;
            else if constexpr (BitLen == 56) mask = 72057594037927935ull;
            else if constexpr (BitLen == 57) mask = 144115188075855871ull;
            else if constexpr (BitLen == 58) mask = 288230376151711743ull;
            else if constexpr (BitLen == 59) mask = 576460752303423487ull;
            else if constexpr (BitLen == 60) mask = 1152921504606846975ull;
            else if constexpr (BitLen == 61) mask = 2305843009213693951ull;
            else if constexpr (BitLen == 62) mask = 4611686018427387903ull;
            else if constexpr (BitLen == 63) mask = 9223372036854775807ull;
            else if constexpr (BitLen == 64) mask = 18446744073709551615ull;
            uint64_t buffer = 0;
            if constexpr (Endianess == ByteOrder::BigEndian)
            {
                uint64_t p = *reinterpret_cast<const uint64_t*>(&data[0]);
                constexpr uint32_t ByteLen = (BitLen / 8) + (BitLen % 8 == 0 ? 0 : 1);
                if constexpr (ByteLen == 0 || ByteLen == 1) buffer = p;
                else if constexpr (ByteLen == 2) buffer = (p & ~mask) | ((uint64_t)data[0] << 8 | (uint64_t)data[1]);
                else if constexpr (ByteLen == 3) buffer = (p & ~mask) | ((uint64_t)data[0] << 16ull) | ((uint64_t)data[1] << 8ull) | (uint64_t)data[2];
                else if constexpr (ByteLen == 4) buffer = (p & ~mask) | ((uint64_t)data[0] << 24ull) | ((uint64_t)data[1] << 16ull) | ((uint64_t)data[2] << 8ull) | (uint64_t)data[3];
                else if constexpr (ByteLen == 5) buffer = (p & ~mask) | ((uint64_t)data[0] << 32ull) | ((uint64_t)data[1] << 24ull) | ((uint64_t)data[2] << 16ull) | ((uint64_t)data[3] << 8ull) | (uint64_t)data[4];
                else if constexpr (ByteLen == 6) buffer = (p & ~mask) | ((uint64_t)data[0] << 40ull) | ((uint64_t)data[1] << 32ull) | ((uint64_t)data[2] << 24ull) | ((uint64_t)data[3] << 16ull) | ((uint64_t)data[4] << 8ull) | (uint64_t)data[5];
                else if constexpr (ByteLen == 7) buffer = (p & ~mask) | ((uint64_t)data[0] << 48ull) | ((uint64_t)data[1] << 40ull) | ((uint64_t)data[2] << 32ull) | ((uint64_t)data[3] << 24ull) | ((uint64_t)data[4] << 16ull) | ((uint64_t)data[5] << 8ull) | (uint64_t)data[6];
                else if constexpr (ByteLen == 8) buffer = (p & ~mask) | ((uint64_t)data[0] << 56ull) | ((uint64_t)data[1] << 48ull) | ((uint64_t)data[2] << 40ull) | ((uint64_t)data[3] << 32ull) | ((uint64_t)data[4] << 24ull) | ((uint64_t)data[5] << 16ull) | ((uint64_t)data[6] << 8ull) | (uint64_t)data[7];
            }
            else
            {
                buffer = *reinterpret_cast<const uint64_t*>(&data[0]);
            }
            uint64_t result = (buffer >> startBit) & mask;
            if constexpr (Signed)
            {
                if ((result >> (BitLen - 1)) & 1)
                {
                    result |= ~mask;
                }
            }
            return result;
        }
    };
    

    With this piece of code, you have to check at dbc parsing time which of the possible combinations a signal has (signedness, bit length, endianness), then create the corresponding NewSignal object, e.g. for BitLen = 51, Endianess = LittleEndian, Signed = false:

    std::unique_ptr<Signal> signal = std::make_unique<NewSignal<51, ByteOrder::LittleEndian, false>>();
    signal->decode({frame.data, frame.dlc}); // decodes as fast as generated code would
    

    This sums up to an if with a total of 256 paths (BitLen * Endianess * Signed = 64 * 2 * 2), and you have to write this big if by hand.
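
    By the way, the 256-path dispatch the parser needs would not necessarily have to be written out by hand. Here is a rough sketch (makeSignal and makeSignalImpl are just placeholder names, and Signal is assumed to be default-constructible) of how it could be generated with a small recursive template that instantiates exactly the 64 * 2 * 2 NewSignal combinations:

    #include <memory>

    template <uint64_t BitLen>
    std::unique_ptr<Signal> makeSignalImpl(uint64_t bitSize, ByteOrder byteOrder, bool isSigned)
    {
        if (bitSize != BitLen)
        {
            // try the next bit length; the constexpr guard stops the template recursion at 64
            if constexpr (BitLen < 64)
                return makeSignalImpl<BitLen + 1>(bitSize, byteOrder, isSigned);
            else
                return nullptr; // no valid bit length matched
        }
        if (byteOrder == ByteOrder::BigEndian)
        {
            if (isSigned)
                return std::make_unique<NewSignal<BitLen, ByteOrder::BigEndian, true>>();
            return std::make_unique<NewSignal<BitLen, ByteOrder::BigEndian, false>>();
        }
        if (isSigned)
            return std::make_unique<NewSignal<BitLen, ByteOrder::LittleEndian, true>>();
        return std::make_unique<NewSignal<BitLen, ByteOrder::LittleEndian, false>>();
    }

    // called once per signal while parsing the dbc; the parser fills in the
    // remaining Signal members (startBit etc.) afterwards, as before
    std::unique_ptr<Signal> makeSignal(uint64_t bitSize, ByteOrder byteOrder, bool isSigned)
    {
        return makeSignalImpl<1>(bitSize, byteOrder, isSigned);
    }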

    But then this code would run as fast as generated code would.

    If you want to avoid C++17, it would be possible to achieve the same with SFINAE (std::enable_if instead of if constexpr).

  3. WorkerH reporter

    What do you think about it?

    I figured out that the bison files have to be changed for this code, so that bison generates the if I was talking about. I have no experience with bison; do you think this is possible?

    If it's possible, maybe I find some time to implement this.

  4. WorkerH reporter

    I started to adapt the bison file. I'm pretty sure now that everything is possible, with some adaptations in the bison file but only a few in the existing C++ code of your library.

    I think I will make a pull request in the next few days.

  5. WorkerH reporter

    Instead of the big switch for the mask a simple calculation can be used:

    constexpr uint64_t mask = (BitLen < 64) ? ((1ull << BitLen) - 1) : ~0ull; // 1ull and the 64-bit guard avoid undefined shifts
    
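    As a quick sanity check, the formula reproduces the values from the big switch above (the BitLen == 64 guard is needed because shifting a 64-bit value by 64 bits is undefined):

    static_assert(((1ull << 10) - 1) == 1023ull, "matches case 10");
    static_assert(((1ull << 51) - 1) == 2251799813685247ull, "matches case 51");
    static_assert(((1ull << 63) - 1) == 9223372036854775807ull, "matches case 63");
    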
  6. Tobias Lorenz repo owner

    Hi Julian, yes, I also had a similar optimization in mind. So currently the method is copying bit-per-bit. Instead I could also copy larger blocks and shift them around. But I like the idea of having a template for this. It makes the code in RAM larger, but it would significantly reduce the signal decoding times. I'm not sure how flex/bison can help here, as it mainly generates the parser code to set up the classes according to the file content. It cannot change the code, but just the data. Bye Tobias
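
    For the little-endian case, the "copy a larger block and shift" idea could look roughly like this (only a sketch, assuming a little-endian host, startBit + bitSize <= 64 and at least 8 bytes of frame data; this is not the current repository code):

    #include <cstdint>
    #include <cstring>

    uint64_t decodeLittleEndianBlock(const uint8_t * data, uint64_t startBit, uint64_t bitSize)
    {
        uint64_t block = 0;
        std::memcpy(&block, data, sizeof(block)); // one 8-byte copy instead of a bit-per-bit loop
        uint64_t mask = (bitSize < 64) ? ((1ull << bitSize) - 1) : ~0ull;
        return (block >> startBit) & mask; // shift the whole block once, then mask
    }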

  7. WorkerH reporter

    Hey,

    the problem I see in your code is that there are many ifs the CPU has to make branch predictions for. So although your new code reduces the number of iterations, it still has the same number of ifs, so the number of branch predictions stays the same. This is what I think will slow down the decoding function, but to be really sure you have to measure the time.

    In my idea I tried to reduce the number of branches by precompiling each possible combination a Signal can have in terms of BitLen, Endianess and Signedness, and then using dynamic dispatch instead of a big switch to execute the correct decode function. I also figured out I can use Bison to achieve this relatively well... I think I will program a proposal too and then we may compare both to find the best one. Greets Julian

  8. WorkerH reporter

    I also think it's okay to exploit the endianness of the system, so instead of your while loop you could use this:

    /* deprecated:
    template <uint64_t SIZE>
    inline uint64_t reverse_bytes(uint64_t sb, uint64_t val);
    template <>
    inline uint64_t reverse_bytes<2>(uint64_t sb, uint64_t val)
    {
        val >>= sb * 8;
        return (((val & 0xFF) << 8ull) |
            ((val & 0xFF00) >> 8ull)) << (sb * 8);
    }
    template <>
    inline uint64_t reverse_bytes<3>(uint64_t sb, uint64_t val)
    {
        val >>= sb * 8;
        return (((val & 0xFF) << 16ull) |
            (val & 0xFF00) |
            ((val & 0xFF0000) >> 16ull)) << (sb * 8);
    }
    template <>
    inline uint64_t reverse_bytes<4>(uint64_t sb, uint64_t val)
    {
        val >>= sb * 8;
        return (((val & 0xFF) << 24ull) |
            ((val & 0xFF00) << 8ull) |
            ((val & 0xFF0000) >> 8ull) |
            ((val & 0xFF000000) >> 24ul)) << (sb * 8);
    }
    template <>
    inline uint64_t reverse_bytes<5>(uint64_t sb, uint64_t val)
    {
        val >>= sb * 8;
        uint64_t ret = val & (mask(40) << (sb * 8));
        return (((val & 0xFF) << 32ull) |
            ((val & 0xFF00) << 16ull) |
            ((val & 0xFF0000)) |
            ((val & 0xFF000000) >> 16ull) |
            ((val & 0xFF00000000) >> 32ul)) << (sb * 8);
    }
    template <>
    inline uint64_t reverse_bytes<6>(uint64_t sb, uint64_t val)
    {
        val >>= sb * 8;
        return (((val & 0xFF) << 40ull) |
            ((val & 0xFF00) << 24ull) |
            ((val & 0xFF0000) << 8ull) |
            ((val & 0xFF000000) >> 8ull) |
            ((val & 0xFF00000000) >> 24ull) |
            ((val & 0xFF0000000000) >> 40ul)) << (sb * 8);
    }
    template <>
    inline uint64_t reverse_bytes<7>(uint64_t sb, uint64_t val)
    {
        val >>= sb * 8;
        return (((val & 0xFF) << 48ull) |
            ((val & 0xFF00) << 32ull) |
            ((val & 0xFF0000) << 16ull) |
            ((val & 0xFF000000)) |
            ((val & 0xFF00000000) >> 16ull) |
            ((val & 0xFF0000000000) >> 32ul) |
            ((val & 0xFF000000000000) >> 48ul)) << (sb * 8);
    }
    template <>
    inline uint64_t reverse_bytes<8>(uint64_t sb, uint64_t val)
    {
        val >>= sb * 8;
        return (((val & 0xFF) << 56ull) |
            ((val & 0xFF00) << 40ull) |
            ((val & 0xFF0000) << 24ull) |
            ((val & 0xFF000000) << 8ull) |
            ((val & 0xFF00000000) >> 8ull) |
            ((val & 0xFF0000000000) >> 24ul) |
            ((val & 0xFF000000000000) >> 40ul) |
            ((val & 0xFF00000000000000) >> 56ul)) << (sb * 8);
    }
    */
    
    // m is the pivot point, beginning with 0. Pass 0 to use byte 0 as pivot point, pass 1 to use the point between byte 0 and byte 1 as pivot point, and so on till 14.
    // e.g.:
    // for (int i = 0; i < 15; i++)
    // {
    //     std::cout << std::hex << "0x" << reverse(i, val) << std::endl;
    // }
    // produces:
    // 0x11
    // 0x1122
    // 0x112233
    // 0x11223344
    // 0x1122334455
    // 0x112233445566
    // 0x11223344556677
    // 0x1122334455667788
    // 0x2233445566778800
    // 0x3344556677880000
    // 0x4455667788000000
    // 0x5566778800000000
    // 0x6677880000000000
    // 0x7788000000000000
    // 0x8800000000000000
    uint64_t reverse(uint64_t m, uint64_t val)
    {
        uint64_t o = m % 2;
        o <<= 3; // fast * 8
        m >>= 1; // fast / 2
        m <<= 3; // fast * 8
        return  ((val & (0xFFull << ((m - 24)))) << ((48 + o))) |
                ((val & (0xFFull << ((m - 16)))) << ((32 + o))) |
                ((val & (0xFFull << ((m - 8)))) << ((16 + o))) |
                ((val & (0xFFull << ((m - 0)))) << ((0 + o))) |
                ((val & (0xFFull << ((m + o + 0)))) >> ((0 + o))) |
                ((val & (0xFFull << ((m + o + 8)))) >> ((16 + o))) |
                ((val & (0xFFull << ((m + o + 16)))) >> ((32 + o))) |
                ((val & (0xFFull << ((m + o + 24)))) >> ((48 + o)));
    }
    #ifdef BOOST_LITTLE_ENDIAN
    #define LITTLE_ENDIAN true
    #else
    #define LITTLE_ENDIAN false
    #endif
    uint64_t d = 0;
    if constexpr (LITTLE_ENDIAN)
    {
        if (byteOrder == ByteOrder::LittleEndian)
        {
            d = *reinterpret_cast<uint64_t*>(&data[0]); // or use std::string_view's function data(): data.data() instead of &data[0]
        }
        else
        {
            uint64_t sb = startBit / 8;
            uint64_t uid = *reinterpret_cast<uint64_t*>(&data[0]);
            uint64_t byte_size = (bitSize + 7 + (startBit % 8)) / 8;
            uint64_t off = ((bitSize + startBit % 8) / 8);
            d = reverse(startBit / 8, uid);
            /*
            switch (byte_size)
            {
            case 0: case 1: d = data[sb] << (sb * 8); break;
            case 2: d = reverse_bytes<2>(sb - off , uid); break;
            case 3: d = reverse_bytes<3>(sb - off , uid); break;
            case 4: d = reverse_bytes<4>(sb - off , uid); break;
            case 5: d = reverse_bytes<5>(sb - off , uid); break;
            case 6: d = reverse_bytes<6>(sb - off , uid); break;
            case 7: d = reverse_bytes<7>(sb - off , uid); break;
            case 8: d = reverse_bytes<8>(sb - off , uid); break;
            }
            */
            startBit -= (byte_size - 1) * 8;
        }
    }
    else
    {
        if (byteOrder == ByteOrder::BigEndian)
        {
            // TODO: the same as ByteOrder::BigEndian on a BOOST_LITTLE_ENDIAN machine, just the opposite way around
        }
        else
        {
            d = *reinterpret_cast<uint64_t*>(&data[0]);
        }
    }
    

    And then just use the processor's bit operations:

    uint64_t mask = (bitSize < 64) ? ((1ull << bitSize) - 1) : ~0ull;
    d = (d >> startBit) & mask;
    if (valueType == ValueType::Signed)
    {
        // branchless sign extension: fills the upper bits only if the sign bit of d is set
        d |= ~(((d & (1ull << (bitSize - 1))) << 1) - 1);
    }
    return d;
    

    This significantly reduces the number of branches in your code, because you don't need the while loop anymore.
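
    As a quick check that the branchless sign extension above behaves like an ordinary if, here is a tiny stand-alone test for bitSize == 10 (a sketch only, with the startBit shift already applied to the raw values):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main()
    {
        const uint64_t bitSize = 10;
        for (uint64_t raw : { 0x3FFull /* -1 */, 0x200ull /* -512 */, 0x1FFull /* 511 */ })
        {
            uint64_t d = raw;
            d |= ~(((d & (1ull << (bitSize - 1))) << 1) - 1);
            // the upper bits get filled exactly when bit 9 (the sign bit) is set
            assert(static_cast<int64_t>(d) == (raw >= 0x200 ? static_cast<int64_t>(raw) - 1024 : static_cast<int64_t>(raw)));
        }
        return 0;
    }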

    You should be able to use CMake for the endianness detection, so you don't need to link Boost just to get the system's endianness. E.g. you could take a look here: https://cmake.org/cmake/help/v3.0/module/TestBigEndian.html
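
    A Boost-free sketch of what the compile-time switch could look like (VECTOR_DBC_WORDS_BIGENDIAN is a hypothetical macro name, assumed to be set by the build system from the TestBigEndian result; with C++20 the <bit> header would make even that unnecessary):

    #if defined(VECTOR_DBC_WORDS_BIGENDIAN)
    constexpr bool hostIsLittleEndian = false;
    #else
    constexpr bool hostIsLittleEndian = true;
    #endif

    // C++20 alternative, without any build system support:
    // #include <bit>
    // constexpr bool hostIsLittleEndian = (std::endian::native == std::endian::little);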

    Or do you have any concerns about using this? Do you see any errors in my reasoning?

    EDIT: While debugging my code I figured out that your code produces something different for this input:

    uint64_t startBit = 37;
    uint64_t bitSize = 20;
    unsigned char data[] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 };
    

    Using the big-endian algorithm of your new decoding function on an x64 machine, I get a different result than I get with pen & paper (I calculated 0x19A22 by hand). Could you verify this? If your algorithm is correct, mine is wrong.

  9. WorkerH reporter

    Okay, I committed a proposal now: https://bitbucket.org/JulianH93/vector_dbc/src/proposal_fast_decode/

    I also made some measurements. First test:

    #include <iostream>
    #include "Vector/DBC.h"
    #include <functional>
    #include <ctime>
    #include <chrono>
    #include <random>
    template<typename TimeT = std::chrono::milliseconds>
    struct measure
    {
        template<typename F, typename ...Args>
        static typename TimeT::rep execution(F&& func, Args&&... args)
        {
            auto start = std::chrono::steady_clock::now();
            std::forward<decltype(func)>(func)(std::forward<Args>(args)...);
            auto duration = std::chrono::duration_cast< TimeT> 
                (std::chrono::steady_clock::now() - start);
            return duration.count();
        }
    };
    void repeat_n_times(std::size_t n, std::function<void()> func)
    {
        for (std::size_t i = 0; i < n; i++)
        {
            func();
        }
    }
    int main()
    {
        Vector::DBC::Signal signal;
        signal.bitSize = 10;
        signal.startBit = 20;
        signal.byteOrder = Vector::DBC::ByteOrder::LittleEndian;
        signal.valueType = Vector::DBC::ValueType::Signed;
        {
            uint64_t data = 0x1122334455667788;
            auto old_time = measure<>::execution(
                repeat_n_times,
                999999999,
                [&data, &signal]()
                {
                    volatile uint64_t raw = signal.decode({(uint8_t*)&data, 8});
                });
            std::cout << "decode: " << old_time << std::endl;
        }
        {
            uint64_t data = 0x1122334455667788;
            auto my_time = measure<>::execution(
                repeat_n_times,
                999999999,
                [&data, &signal]()
                {
                    volatile uint64_t raw = signal.my_decode({(uint8_t*)&data, 8});
                });
            std::cout << "my_decode: " << my_time << std::endl;
        }
        {
            uint64_t data = 0x1122334455667788;
            auto lor_time = measure<>::execution(
                repeat_n_times,
                999999999,
                [&data, &signal]()
                {
                    volatile uint64_t raw = signal.lor_decode({(uint8_t*)&data, 8});
                });
            std::cout << "lor_decode: " << lor_time << std::endl;
        }
    }
    
    decode: 17228
    my_decode: 11101
    lor_decode: 17386
    

    My code runs ~60% faster than the old code, with no big difference between your old method and your new method. I couldn't do tests for BigEndian with your code, because it seems like your code is buggy and doesn't terminate for BigEndian (I got out-of-bounds errors while testing), but my code needs ~50% more time for BigEndian than for LittleEndian, so it should still be faster than your method.

    Second test:

    #include <iostream>
    #include "Vector/DBC.h"
    #include <functional>
    #include <ctime>
    #include <chrono>
    #include <random>
    template<typename TimeT = std::chrono::milliseconds>
    struct measure
    {
        template<typename F, typename ...Args>
        static typename TimeT::rep execution(F&& func, Args&&... args)
        {
            auto start = std::chrono::steady_clock::now();
            std::forward<decltype(func)>(func)(std::forward<Args>(args)...);
            auto duration = std::chrono::duration_cast< TimeT> 
                (std::chrono::steady_clock::now() - start);
            return duration.count();
        }
    };
    void repeat_n_times(std::size_t n, std::function<void()> func)
    {
        for (std::size_t i = 0; i < n; i++)
        {
            func();
        }
    }
    int main()
    {
        Vector::DBC::Signal signal;
        signal.bitSize = 10;
        signal.startBit = 20;
        signal.byteOrder = Vector::DBC::ByteOrder::LittleEndian;
        signal.valueType = Vector::DBC::ValueType::Signed;
        std::random_device rd;
        auto s = rd();
        std::uniform_int_distribution<uint64_t> dis(0, uint64_t(-1));
        {
            std::mt19937 gen{s};
            auto old_time = measure<>::execution(
                repeat_n_times,
                99999999,
                [&gen, &dis, &signal]()
                {
                    uint64_t val = dis(gen);
                    volatile uint64_t raw = signal.decode({(uint8_t*)&val, 8});
                });
            std::cout << "decode: " << old_time << std::endl;
        }
        {
            std::mt19937 gen{s};
            auto my_time = measure<>::execution(
                repeat_n_times,
                99999999,
                [&gen, &dis, &signal]()
                {
                    uint64_t val = dis(gen);
                    volatile uint64_t raw = signal.my_decode({(uint8_t*)& val, 8});
                });
            std::cout << "my_decode: " << my_time << std::endl;
        }
        {
            std::mt19937 gen{s};
            auto lor_time = measure<>::execution(
                repeat_n_times,
                99999999,
                [&gen, &dis, &signal]()
                {
                    uint64_t val = dis(gen);
                    volatile uint64_t raw = signal.lor_decode({(uint8_t*)& val, 8});
                });
            std::cout << "lor_decode: " << lor_time << std::endl;
        }
    }
    

    I used a deterministic RNG: random, so that the test makes the processor fail its branch predictions (it's also closer to practice), and deterministic, so that each decode function has to decode the same values. With this test my code runs more than 3 times faster than your old one, but only a bit faster than your new one:

    decode: 10556
    my_decode: 3379
    lor_decode: 5720
    

    But you have to keep in mind that I only tested for bitSize == 10. With a greater bitSize your algorithm will get slower and slower, while mine stays the same.

    What do you think?

    PS: There should be some more optimization potential using my idea of precompiling and virtual dispatch to get rid of the last few branches, but considering the smaller-than-expected performance benefit (I expected more) I'm not sure whether it's worth the ~300 KB of additional code.
