[Bug 194635] Speed optimisation for framebuffer console driver on Raspberry Pi

Mon Dec 29 06:41:27 UTC 2014

https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=194635

--- Comment #8 from Adrian Chadd <adrian at freebsd.org> ---
Ok, so I finally got around to this!

FreeBSD-HEAD is using vt now, not syscons - I'll still merge your stuff at some
point, but your code is for the syscons console. For vt, the framebuffer driver
exposes a plain memory-mapped framebuffer to the vt code, which then uses the
code in sys/dev/vt/hw/fb/ to draw things.

So, that code also does mostly what you've done, and it's doing it 8, 16, or 32
bits at a time depending upon the bpp.

So, I figured I'd write something that just mmap'ed /dev/fb0 into userland and
tried 8, 16 and 32 bit stores to see what's faster.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <time.h>
#include <sys/mman.h>
#include <sys/types.h>

#include <err.h>

// Mode line the constants below were copied from:
// fb0: 1184x624(0x0 at 0,0) 16bpp

#define WIDTH   1184
#define HEIGHT  624
#define BPP     16

// Number of bytes each fill pass touches.  This is not the true framebuffer
// size - that depends on the "stride", which is not known here - so rows are
// treated as tightly packed and the result is taken to be in bytes.
#define FB_SIZE (WIDTH * HEIGHT * (BPP / 8))

// Return the interval (end - start) as a timespec.
//
// The seconds and nanoseconds fields are subtracted independently; when the
// nanosecond difference comes out negative, one second is borrowed so that
// the returned tv_nsec is non-negative.
struct timespec
ts_diff(struct timespec start, struct timespec end)
{
        struct timespec delta;

        delta.tv_sec = end.tv_sec - start.tv_sec;
        delta.tv_nsec = end.tv_nsec - start.tv_nsec;

        if (delta.tv_nsec < 0) {
                // Borrow one second (1e9 ns) from the seconds field.
                delta.tv_sec -= 1;
                delta.tv_nsec += 1000000000;
        }

        return delta;
}

// Fill FB_SIZE bytes starting at fb with val, one 8-bit store per byte.
void
fill_1byte(char *fb, char val)
{
        char *cur = fb;
        char *const stop = fb + FB_SIZE;

        while (cur < stop)
                *cur++ = val;
}

// Fill FB_SIZE bytes starting at fb with val, using one 16-bit store per
// two bytes (FB_SIZE / 2 stores in total).
void
fill_2byte(char *fb, uint16_t val)
{
        uint16_t *dst = (void *) fb;
        int left;

        for (left = FB_SIZE / 2; left > 0; left--)
                *dst++ = val;
}

// Fill FB_SIZE bytes starting at fb with val, using one 32-bit store per
// four bytes (FB_SIZE / 4 stores in total).
void
fill_4byte(char *fb, uint32_t val)
{
        uint32_t *dst = (void *) fb;
        int left;

        for (left = FB_SIZE / 4; left > 0; left--)
                *dst++ = val;
}

// Map /dev/fb0 and time 100 full passes over FB_SIZE bytes with 8-, 16- and
// 32-bit stores in turn, printing the elapsed time of each variant.
//
// Exits with status 1 (via err()) if the device cannot be opened or mapped,
// and with status 0 once all three timings have been printed.
int
main(int argc, const char *argv[])
{
        char *fb = NULL;
        int fd;
        int i;
        struct timespec tv_start, tv_end, tv_diff;

        fd = open("/dev/fb0", O_RDWR);
        if (fd < 0) {
                err(1, "%s: open", __func__);
        }

        fb = mmap(NULL, FB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (fb == MAP_FAILED) {
                err(1, "%s: mmap", __func__);
        }

        // tv_nsec is a nanosecond count, so the fractional part must be
        // zero-padded to nine digits; a narrower width drops leading zeros
        // and prints e.g. 4 s + 15364000 ns as "4.15364000".

        // 8-bit stores.
        clock_gettime(CLOCK_MONOTONIC_PRECISE, &tv_start);
        for (i = 0; i < 100; i++)
                fill_1byte(fb, i);
        clock_gettime(CLOCK_MONOTONIC_PRECISE, &tv_end);
        tv_diff = ts_diff(tv_start, tv_end);
        printf("8 bit: 100 runs: %lld.%09lld sec\n",
            (long long) tv_diff.tv_sec,
            (long long) tv_diff.tv_nsec);

        // 16-bit stores.
        clock_gettime(CLOCK_MONOTONIC_PRECISE, &tv_start);
        for (i = 0; i < 100; i++)
                fill_2byte(fb, i);
        clock_gettime(CLOCK_MONOTONIC_PRECISE, &tv_end);
        tv_diff = ts_diff(tv_start, tv_end);
        printf("16 bit: 100 runs: %lld.%09lld sec\n",
            (long long) tv_diff.tv_sec,
            (long long) tv_diff.tv_nsec);

        // 32-bit stores.
        clock_gettime(CLOCK_MONOTONIC_PRECISE, &tv_start);
        for (i = 0; i < 100; i++)
                fill_4byte(fb, i);
        clock_gettime(CLOCK_MONOTONIC_PRECISE, &tv_end);
        tv_diff = ts_diff(tv_start, tv_end);
        printf("32 bit: 100 runs: %lld.%09lld sec\n",
            (long long) tv_diff.tv_sec,
            (long long) tv_diff.tv_nsec);

        exit(0);
}

.. and the output:

root at raspberry-pi:~ # ./test 
8 bit: 100 runs: 4.15364000 sec
16 bit: 100 runs: 2.107316000 sec
32 bit: 100 runs: 1.12614000 sec
root at raspberry-pi:~ # 

.. so:

* Your work is good and it's still good for people using syscons, but you
should double-check what's in sys/dev/vt/hw/fb/ to see whether there's any
optimisation there;
* To get the best speed, we should be doing 32 bit stores, not lots of 8 or
16 bit stores. The above test filled the same region of memory but with 8, 16
and 32 bit stores. The difference between 8, 16 and 32 bit is quite
substantial: about 4, 2 and 1 seconds respectively for 100 fills, so each
doubling of the store width roughly halves the time.

-- 
You are receiving this mail because:
You are the assignee for the bug.