From 52ed2fdd221da4f73887738d1908a6b4e787dbfb Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 08:30:57 +0200
Subject: [PATCH 1/9] tv.h: replace time conv macros (+ in alsa)

Replace time conversion macros defined 2 commits ago between msec
and usec.

This is proposed (considered) solution to the todo in tv.h - inconsistent
unit conversion macros. Rather than value, it might be better to use
function-like macros.

The issues is that there will be actaully double the amount - A_TO_B and
B_TO_A. To keep it in a reasonable range, it would be best to remove
the _DBL versions and keep the user typing the arg to double inestead
---
 src/audio/playback/alsa.c | 23 ++++++++++-------------
 src/tv.h                  |  5 +++--
 2 files changed, 13 insertions(+), 15 deletions(-)
diff --git a/src/audio/playback/alsa.c b/src/audio/playback/alsa.c
index d8d04a405..efceaf0c7 100644
--- a/src/audio/playback/alsa.c
+++ b/src/audio/playback/alsa.c
@@ -448,7 +448,7 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode,
         enum {
                 REC_MIN_BUF_US = 5000,
         };
-        unsigned int buf_len = 0;
+        unsigned int buf_len_us = 0;
         int          buf_dir = -1;
         const char  *buff_param = get_commandline_param("alsa-playback-buffer");
 
@@ -456,10 +456,10 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode,
             buff_param == NULL) {
                 // set minimal value from the configuration space
                 CHECK_OK(snd_pcm_hw_params_set_buffer_time_first(
-                    handle, params, &buf_len, &buf_dir));
+                    handle, params, &buf_len_us, &buf_dir));
                 MSG(INFO, "ALSA driver buffer len set to: %lf ms\n",
-                    buf_len / US_IN_1MS_DBL);
-                if (buf_len <= REC_MIN_BUF_US) {
+                    US_TO_MS((double) buf_len_us));
+                if (buf_len_us <= REC_MIN_BUF_US) {
                         MSG(WARNING,
                             "ALSA driver buffer len less than %d usec seem to "
                             "be too loow, consider using alsa-playback-buffer "
@@ -469,22 +469,19 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode,
                 return;
         }
 
-        if (buff_param != NULL) {
-                buf_len = atoi(buff_param);
-        } else {
-                buf_len = (playback_mode == SYNC ? BUF_LEN_DEFAULT_SYNC_MS
-                                                 : BUF_LEN_DEFAULT_MS) *
-                          US_IN_1MS;
-        }
+        buf_len_us = buff_param != NULL ? atoi(buff_param)
+                                        : MS_TO_US(playback_mode == SYNC
+                                                       ? BUF_LEN_DEFAULT_SYNC_MS
+                                                       : BUF_LEN_DEFAULT_MS);
 
         const int rc = snd_pcm_hw_params_set_buffer_time_near(
-            handle, params, &buf_len, &buf_dir);
+            handle, params, &buf_len_us, &buf_dir);
         if (rc < 0) {
                 MSG(WARNING, "Warning - unable to set buffer to its size: %s\n",
                     snd_strerror(rc));
         }
         MSG(INFO, "ALSA driver buffer len set to: %lf ms\n",
-            buf_len / US_IN_1MS_DBL);
+            US_TO_MS((double) buf_len_us));
 }
 
 ADD_TO_PARAM("alsa-play-period-size", "* alsa-play-period-size=<frames>\n"
diff --git a/src/tv.h b/src/tv.h
index 53c5eca82..2625592e3 100644
--- a/src/tv.h
+++ b/src/tv.h
@@ -86,8 +86,6 @@ typedef long long time_ns_t;
 #define MS_IN_NS_DBL 1000000.0
 #define MS_IN_SEC 1000
 #define MS_IN_SEC_DBL 1000.0
-#define US_IN_1MS 1000
-#define US_IN_1MS_DBL 1000.0
 #define US_IN_SEC 1000000LL
 #define US_IN_NS 1000LL
 #define US_IN_SEC_DBL ((double) US_IN_SEC)
@@ -97,6 +95,9 @@ typedef long long time_ns_t;
 #define NS_IN_SEC_DBL ((double) NS_IN_SEC)
 #define NS_IN_US (NS_IN_SEC/US_IN_SEC)
 #define NS_IN_US_DBL ((double) NS_IN_US)
+#define US_TO_MS(val_us) ((val_us) / 1000)
+#define MS_TO_US(val_ms) ((val_ms) * 1000)
+
 static inline time_ns_t get_time_in_ns() {
 #ifdef HAVE_TIMESPEC_GET
         struct timespec ts = { 0, 0 };

From be7dcb67c79735041647b4004501782c706d9e10 Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 08:30:57 +0200
Subject: [PATCH 2/9] aplay/alsa: print req val if buff fails to set

---
 src/audio/playback/alsa.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/audio/playback/alsa.c b/src/audio/playback/alsa.c
index efceaf0c7..313ae484d 100644
--- a/src/audio/playback/alsa.c
+++ b/src/audio/playback/alsa.c
@@ -477,8 +477,9 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode,
         const int rc = snd_pcm_hw_params_set_buffer_time_near(
             handle, params, &buf_len_us, &buf_dir);
         if (rc < 0) {
-                MSG(WARNING, "Warning - unable to set buffer to its size: %s\n",
-                    snd_strerror(rc));
+                MSG(WARNING,
+                    "Warning - unable to set buffer to its size %u us: %s\n",
+                    buf_len_us, snd_strerror(rc));
         }
         MSG(INFO, "ALSA driver buffer len set to: %lf ms\n",
             US_TO_MS((double) buf_len_us));

From b1cd3166c4fab1fc9c9218564a2f3d1dc6772af6 Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 09:16:49 +0200
Subject: [PATCH 3/9] Revert "export: crash if nullptr passed"

This reverts commit 41439006052f9e82c8350a3a72ef7ffb50f2a88d.

Actaully, the transcoding reflector doesn't set the export module,
so that it will crash with this. Crasing command:
```
hd-rum-transcode 8M 5004 -c lavc -P 6004 100::
```
---
 src/export.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/export.c b/src/export.c
index 807b30591..3edb0560b 100644
--- a/src/export.c
+++ b/src/export.c
@@ -35,7 +35,6 @@
  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <assert.h>
 #include <dirent.h>
 #include <errno.h>            // for errno, EEXIST
 #include <limits.h>
@@ -341,7 +340,9 @@ static void process_messages(struct exporter *s) {
 
 void export_audio(struct exporter *s, struct audio_frame *frame)
 {
-        assert(s != NULL);
+        if(!s){
+                return;
+        }
 
         process_messages(s);
 
@@ -354,7 +355,9 @@ void export_audio(struct exporter *s, struct audio_frame *frame)
 
 void export_video(struct exporter *s, struct video_frame *frame)
 {
-        assert(s != NULL);
+        if(!s){
+                return;
+        }
 
         process_messages(s);
 

From 4d5f7a76acd3b9199edd323deef85a75a10d9003 Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 11:35:02 +0200
Subject: [PATCH 4/9] kernels: report elapsed mode in debug

Instead of using compile-time DEBUG macro, prefer run-time specified
log_level.
---
 src/cuda_wrapper/kernels.cu | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu
index 967af5106..92fb8a406 100644
--- a/src/cuda_wrapper/kernels.cu
+++ b/src/cuda_wrapper/kernels.cu
@@ -41,11 +41,8 @@
 #include <cstdio>
 #include <cuda_runtime_api.h>
 
-#ifdef DEBUG
-#define D_PRINTF printf
-#else
-#define D_PRINTF(...)
-#endif
+extern volatile int log_level;
+#define LOG_LEVEL_DEBUG 7
 
 #define MEASURE_KERNEL_DURATION_START(stream) \
         cudaEvent_t t0, t1; \
@@ -57,7 +54,9 @@
         cudaEventSynchronize(t1); \
         float elapsedTime = NAN; \
         cudaEventElapsedTime(&elapsedTime, t0, t1); \
-        D_PRINTF("%s elapsed time: %f ms\n", __func__, elapsedTime); \
+        if (log_level >= LOG_LEVEL_DEBUG) { \
+                printf("%s elapsed time: %f ms\n", __func__, elapsedTime); \
+        } \
         if (elapsedTime > 10.0) { \
                 fprintf( \
                     stderr, \

From f17b0d848707c9a873420ab47d7e5c063c918fe2 Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 11:36:52 +0200
Subject: [PATCH 5/9] r12l_to_rg48_compute_blk: optimize

load/store uint32_t to optimize the performance

This reduces the duration from some 16.6 ms to 0.6 ms for 4096x216 on
1080 Ti.

refers to GH-406
---
 src/cuda_wrapper/kernels.cu | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu
index 92fb8a406..93a864309 100644
--- a/src/cuda_wrapper/kernels.cu
+++ b/src/cuda_wrapper/kernels.cu
@@ -282,7 +282,7 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
 
         if (position_x == size_x / 8) {
                 // compute the last incomplete block
-                uint8_t tmp[48];
+                alignas(uint32_t) uint8_t tmp[48];
                 r12l_to_rg48_compute_blk(src, tmp);
                 for (unsigned i = 0; i < (size_x - position_x * 8) * 6; ++i) {
                         dst[i] = tmp[i];
@@ -294,8 +294,19 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
 
 /// adapted variant of @ref vc_copylineR12LtoRG48
 __device__ static void
-r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)
+r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
 {
+        // load the data from in to src_u32
+        auto *in_u32 = (uint32_t *) in;
+        uint32_t src_u32[9];
+        for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
+                src_u32[i] = in_u32[i];
+        }
+
+        uint32_t dst_u32[12];
+        uint8_t *dst = (uint8_t *) dst_u32;
+        uint8_t *src = (uint8_t *) src_u32;
+
         // 0
         // R
         *dst++ = src[0] << 4;
@@ -376,6 +387,12 @@ r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)
 
         *dst++ = src[32 + 2] & 0xF0;
         *dst++ = src[32 + 3];
+
+        // store the result
+        auto *out_u32 = (uint32_t *) out;
+        for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
+                out_u32[i] = dst_u32[i];
+        }
 }
 
 void

From dfba5794375c611b43fb9951751e5c52c58aa649 Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 12:16:55 +0200
Subject: [PATCH 6/9] rt48_to_r12l_compute_blk: optimize as well

see the previous commit

Duration for 4096x2160 reduction is from some 18.5 to 0.5 ms.

refers to GH-406
---
 src/cuda_wrapper/kernels.cu | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu
index 93a864309..456cf6f4c 100644
--- a/src/cuda_wrapper/kernels.cu
+++ b/src/cuda_wrapper/kernels.cu
@@ -73,8 +73,19 @@ extern volatile int log_level;
  * modified @ref vc_copylineRG48toR12L
  */
 __device__ static void
-rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
+rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out)
 {
+        // load the data from in to src_u32
+        auto *in_u32 = (uint32_t *) in;
+        uint32_t src_u32[12];
+        for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
+                src_u32[i] = in_u32[i];
+        }
+
+        uint32_t dst_u32[9];
+        auto *dst = (uint8_t *) dst_u32;
+        auto *src = (uint8_t *) src_u32;
+
         // 0
         dst[0] = src[0] >> 4;
         dst[0] |= src[1] << 4;
@@ -190,12 +201,18 @@ rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
         dst[32 + 2] |= src[0] & 0xF0;
         dst[32 + 3] = src[1];
         src += 2;
+
+        // store the result
+        auto *out_u32 = (uint32_t *) out;
+        for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
+                out_u32[i] = dst_u32[i];
+        }
 }
 
 __device__ static void
 rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width)
 {
-        uint8_t tmp[48];
+        alignas(uint32_t) uint8_t tmp[48];
         for (unsigned i = 0; i < width * 6; ++i) {
                 tmp[i] = src[i];
         }

From 4c12bc85dae7983b6f3da44dd23b3e8d0892df07 Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 13:46:32 +0200
Subject: [PATCH 7/9] r12l_to_rg48_compute_blk: fixed odd widths

Fixes unaligned access introduced in HEAD~2 (optimizing the r12l_to_rg48
kernel).
---
 src/cuda_wrapper/kernels.cu | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu
index 456cf6f4c..52d68210e 100644
--- a/src/cuda_wrapper/kernels.cu
+++ b/src/cuda_wrapper/kernels.cu
@@ -282,9 +282,11 @@ int postprocess_rg48_to_r12l(
 //   / , _/ / /  / __/  / /__/___/ > > / , _// (_ / /_  _// _  |
 //  /_/|_| /_/  /____/ /____/     /_/ /_/|_| \___/   /_/  \___/
 
+template <typename store_t>
 __device__ static void r12l_to_rg48_compute_blk(const uint8_t *src,
                                                 uint8_t       *dst);
 
+template <typename store_t>
 __global__ static void
 kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
 {
@@ -300,16 +302,17 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
         if (position_x == size_x / 8) {
                 // compute the last incomplete block
                 alignas(uint32_t) uint8_t tmp[48];
-                r12l_to_rg48_compute_blk(src, tmp);
+                r12l_to_rg48_compute_blk<store_t>(src, tmp);
                 for (unsigned i = 0; i < (size_x - position_x * 8) * 6; ++i) {
                         dst[i] = tmp[i];
                 }
                 return;
         }
-        r12l_to_rg48_compute_blk(src, dst);
+        r12l_to_rg48_compute_blk<store_t>(src, dst);
 }
 
 /// adapted variant of @ref vc_copylineR12LtoRG48
+template <typename store_t>
 __device__ static void
 r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
 {
@@ -406,9 +409,15 @@ r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
         *dst++ = src[32 + 3];
 
         // store the result
-        auto *out_u32 = (uint32_t *) out;
+        auto *out_t = (store_t *) out;
         for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
-                out_u32[i] = dst_u32[i];
+                static_assert(sizeof(store_t) == 2 || sizeof(store_t) == 4);
+                if constexpr (sizeof(store_t) == 4) {
+                        out_t[i] = dst_u32[i];
+                } else  {
+                        out_t[2 * i] = dst_u32[i] & 0xFFFFU;
+                        out_t[2 * i + 1] = dst_u32[i] >> 16;
+                }
         }
 }
 
@@ -420,8 +429,20 @@ preprocess_r12l_to_rg48(int width, int height, void *src, void *dst)
         dim3 blocks((((width + 7) / 8) + 255) / 256, height);
 
         MEASURE_KERNEL_DURATION_START(0)
-        kernel_r12l_to_rg48<<<blocks, threads_per_block>>>(
-            (uint8_t *) src, (uint8_t *) dst, width);
+        if (width % 2 == 0) {
+                kernel_r12l_to_rg48<uint32_t><<<blocks, threads_per_block>>>(
+                    (uint8_t *) src, (uint8_t *) dst, width);
+        } else {
+                thread_local bool warn_print;
+                if (!warn_print) {
+                        fprintf(stderr,
+                                "%s: Odd width %d px will use slower kernel!\n",
+                                __func__, width);
+                        warn_print = true;
+                }
+                kernel_r12l_to_rg48<uint16_t><<<blocks, threads_per_block>>>(
+                    (uint8_t *) src, (uint8_t *) dst, width);
+        }
         MEASURE_KERNEL_DURATION_STOP(0)
 }
 

From faa6a7bd7291b5a96ee6b176dd9859377628121c Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 14:12:08 +0200
Subject: [PATCH 8/9] rt48_to_r12l_compute_blk: fixed odd widths

see the previous commit as well
---
 src/cuda_wrapper/kernels.cu | 49 +++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu
index 52d68210e..b3a6accb0 100644
--- a/src/cuda_wrapper/kernels.cu
+++ b/src/cuda_wrapper/kernels.cu
@@ -72,15 +72,21 @@ extern volatile int log_level;
 /**
  * modified @ref vc_copylineRG48toR12L
  */
+template <typename load_t>
 __device__ static void
 rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out)
 {
-        // load the data from in to src_u32
-        auto *in_u32 = (uint32_t *) in;
-        uint32_t src_u32[12];
-        for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
-                src_u32[i] = in_u32[i];
-        }
+         // load the data from in to src_u32
+         auto    *in_t = (load_t *) in;
+         uint32_t src_u32[12];
+         for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
+                 static_assert(sizeof(load_t) == 2 || sizeof(load_t) == 4);
+                 if constexpr (sizeof(load_t) == 4) {
+                         src_u32[i] = in_t[i];
+                 } else {
+                         src_u32[i] = in_t[2 * i] | in_t[2 * i + 1] << 16;
+                 }
+         }
 
         uint32_t dst_u32[9];
         auto *dst = (uint8_t *) dst_u32;
@@ -209,6 +215,7 @@ rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out)
         }
 }
 
+template <typename load_t>
 __device__ static void
 rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width)
 {
@@ -216,12 +223,13 @@ rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width)
         for (unsigned i = 0; i < width * 6; ++i) {
                 tmp[i] = src[i];
         }
-        rt48_to_r12l_compute_blk(tmp, dst);
+        rt48_to_r12l_compute_blk<load_t>(tmp, dst);
 }
 
 /**
  * @todo fix the last block for widths not divisible by 8
  */
+template <typename load_t>
 __global__ static void
 kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
 {
@@ -236,11 +244,11 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
 
         // handle incomplete blocks
         if (position_x == size_x / 8) {
-                rt48_to_r12l_compute_last_blk(src, dst,
-                                              size_x - position_x * 8);
+                rt48_to_r12l_compute_last_blk<load_t>(src, dst,
+                                                      size_x - position_x * 8);
                 return;
         }
-        rt48_to_r12l_compute_blk(src, dst);
+        rt48_to_r12l_compute_blk<load_t>(src, dst);
 }
 
 /**
@@ -268,9 +276,24 @@ int postprocess_rg48_to_r12l(
 
         MEASURE_KERNEL_DURATION_START(stream)
 
-        kernel_rg48_to_r12l<<<blocks, threads_per_block, 0,
-                              (cudaStream_t) stream>>>(
-            (uint8_t *) input_samples, (uint8_t *) output_buffer, size_x);
+        if (size_x % 2 == 0) {
+                kernel_rg48_to_r12l<uint32_t>
+                    <<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
+                        (uint8_t *) input_samples, (uint8_t *) output_buffer,
+                        size_x);
+        } else {
+                thread_local bool warn_print;
+                if (!warn_print) {
+                        fprintf(stderr,
+                                "%s: Odd width %d px will use slower kernel!\n",
+                                __func__, size_x);
+                        warn_print = true;
+                }
+                kernel_rg48_to_r12l<uint16_t>
+                    <<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
+                        (uint8_t *) input_samples, (uint8_t *) output_buffer,
+                        size_x);
+        }
 
         MEASURE_KERNEL_DURATION_STOP(stream)
 

From 8b7db976c8561845a63ec260f8c9e2baf4de5cab Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Tue, 10 Sep 2024 14:34:15 +0200
Subject: [PATCH 9/9] cmpto_j2k: print also cpu convs duration

in debug
---
 src/tv.h                           | 1 +
 src/video_compress/cmpto_j2k.cpp   | 5 +++++
 src/video_decompress/cmpto_j2k.cpp | 5 +++++
 3 files changed, 11 insertions(+)

diff --git a/src/tv.h b/src/tv.h
index 2625592e3..619c226e8 100644
--- a/src/tv.h
+++ b/src/tv.h
@@ -97,6 +97,7 @@ typedef long long time_ns_t;
 #define NS_IN_US_DBL ((double) NS_IN_US)
 #define US_TO_MS(val_us) ((val_us) / 1000)
 #define MS_TO_US(val_ms) ((val_ms) * 1000)
+#define NS_TO_MS(val_ns) ((val_ns) / 1000 / 1000)
 
 static inline time_ns_t get_time_in_ns() {
 #ifdef HAVE_TIMESPEC_GET
diff --git a/src/video_compress/cmpto_j2k.cpp b/src/video_compress/cmpto_j2k.cpp
index 926991d47..8eb2a946c 100644
--- a/src/video_compress/cmpto_j2k.cpp
+++ b/src/video_compress/cmpto_j2k.cpp
@@ -161,9 +161,14 @@ static void parallel_conv(video_frame *dst, video_frame *src){
         decoder_t decoder =
             get_decoder_from_to(src->color_spec, dst->color_spec);
         assert(decoder != nullptr);
+        time_ns_t t0 = get_time_in_ns();
         parallel_pix_conv((int) src->tiles[0].height, dst->tiles[0].data,
                           dst_pitch, src->tiles[0].data, src_pitch,
                           decoder, 0);
+        if (log_level >= LOG_LEVEL_DEBUG) {
+                MSG(DEBUG, "pixfmt conversion duration: %f ms\n",
+                    NS_TO_MS((double) (get_time_in_ns() - t0)));
+        }
 }
 
 #ifdef HAVE_CUDA
diff --git a/src/video_decompress/cmpto_j2k.cpp b/src/video_decompress/cmpto_j2k.cpp
index 479c2851d..f0c77c7ce 100644
--- a/src/video_decompress/cmpto_j2k.cpp
+++ b/src/video_decompress/cmpto_j2k.cpp
@@ -140,9 +140,14 @@ static void rg48_to_r12l(unsigned char *dst_buffer,
         int dst_len = vc_get_linesize(width, R12L);
         decoder_t vc_copylineRG48toR12L = get_decoder_from_to(RG48, R12L);
 
+        time_ns_t t0 = get_time_in_ns();
         parallel_pix_conv((int) height, (char *) dst_buffer, dst_len,
                           (const char *) src_buffer, src_pitch,
                           vc_copylineRG48toR12L, 0);
+        if (log_level >= LOG_LEVEL_DEBUG) {
+                MSG(DEBUG, "pixfmt conversion duration: %f ms\n",
+                    NS_TO_MS((double) (get_time_in_ns() - t0)));
+        }
 }
 
 static void print_dropped(unsigned long long int dropped) {