From 52ed2fdd221da4f73887738d1908a6b4e787dbfb Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 08:30:57 +0200 Subject: [PATCH 1/9] tv.h: replace time conv macros (+ in alsa) Replace time conversion macros defined 2 commits ago between msec and usec. This is proposed (considered) solution to the todo in tv.h - inconsistent unit conversion macros. Rather than value, it might be better to use function-like macros. The issues is that there will be actaully double the amount - A_TO_B and B_TO_A. To keep it in a reasonable range, it would be best to remove the _DBL versions and keep the user typing the arg to double inestead --- src/audio/playback/alsa.c | 23 ++++++++++------------- src/tv.h | 5 +++-- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/audio/playback/alsa.c b/src/audio/playback/alsa.c index d8d04a405..efceaf0c7 100644 --- a/src/audio/playback/alsa.c +++ b/src/audio/playback/alsa.c @@ -448,7 +448,7 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode, enum { REC_MIN_BUF_US = 5000, }; - unsigned int buf_len = 0; + unsigned int buf_len_us = 0; int buf_dir = -1; const char *buff_param = get_commandline_param("alsa-playback-buffer"); @@ -456,10 +456,10 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode, buff_param == NULL) { // set minimal value from the configuration space CHECK_OK(snd_pcm_hw_params_set_buffer_time_first( - handle, params, &buf_len, &buf_dir)); + handle, params, &buf_len_us, &buf_dir)); MSG(INFO, "ALSA driver buffer len set to: %lf ms\n", - buf_len / US_IN_1MS_DBL); - if (buf_len <= REC_MIN_BUF_US) { + US_TO_MS((double) buf_len_us)); + if (buf_len_us <= REC_MIN_BUF_US) { MSG(WARNING, "ALSA driver buffer len less than %d usec seem to " "be too loow, consider using alsa-playback-buffer " @@ -469,22 +469,19 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode, return; } - if (buff_param != NULL) { - buf_len = atoi(buff_param); - } else { - buf_len = (playback_mode == SYNC ? BUF_LEN_DEFAULT_SYNC_MS - : BUF_LEN_DEFAULT_MS) * - US_IN_1MS; - } + buf_len_us = buff_param != NULL ? atoi(buff_param) + : MS_TO_US(playback_mode == SYNC + ? BUF_LEN_DEFAULT_SYNC_MS + : BUF_LEN_DEFAULT_MS); const int rc = snd_pcm_hw_params_set_buffer_time_near( - handle, params, &buf_len, &buf_dir); + handle, params, &buf_len_us, &buf_dir); if (rc < 0) { MSG(WARNING, "Warning - unable to set buffer to its size: %s\n", snd_strerror(rc)); } MSG(INFO, "ALSA driver buffer len set to: %lf ms\n", - buf_len / US_IN_1MS_DBL); + US_TO_MS((double) buf_len_us)); } ADD_TO_PARAM("alsa-play-period-size", "* alsa-play-period-size=\n" diff --git a/src/tv.h b/src/tv.h index 53c5eca82..2625592e3 100644 --- a/src/tv.h +++ b/src/tv.h @@ -86,8 +86,6 @@ typedef long long time_ns_t; #define MS_IN_NS_DBL 1000000.0 #define MS_IN_SEC 1000 #define MS_IN_SEC_DBL 1000.0 -#define US_IN_1MS 1000 -#define US_IN_1MS_DBL 1000.0 #define US_IN_SEC 1000000LL #define US_IN_NS 1000LL #define US_IN_SEC_DBL ((double) US_IN_SEC) @@ -97,6 +95,9 @@ typedef long long time_ns_t; #define NS_IN_SEC_DBL ((double) NS_IN_SEC) #define NS_IN_US (NS_IN_SEC/US_IN_SEC) #define NS_IN_US_DBL ((double) NS_IN_US) +#define US_TO_MS(val_us) ((val_us) / 1000) +#define MS_TO_US(val_ms) ((val_ms) * 1000) + static inline time_ns_t get_time_in_ns() { #ifdef HAVE_TIMESPEC_GET struct timespec ts = { 0, 0 }; From be7dcb67c79735041647b4004501782c706d9e10 Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 08:30:57 +0200 Subject: [PATCH 2/9] aplay/alsa: print req val if buff fails to set --- src/audio/playback/alsa.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/audio/playback/alsa.c b/src/audio/playback/alsa.c index efceaf0c7..313ae484d 100644 --- a/src/audio/playback/alsa.c +++ b/src/audio/playback/alsa.c @@ -477,8 +477,9 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode, const int rc = snd_pcm_hw_params_set_buffer_time_near( handle, params, &buf_len_us, &buf_dir); if (rc < 0) { - MSG(WARNING, "Warning - unable to set buffer to its size: %s\n", - snd_strerror(rc)); + MSG(WARNING, + "Warning - unable to set buffer to its size %u us: %s\n", + buf_len_us, snd_strerror(rc)); } MSG(INFO, "ALSA driver buffer len set to: %lf ms\n", US_TO_MS((double) buf_len_us)); From b1cd3166c4fab1fc9c9218564a2f3d1dc6772af6 Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 09:16:49 +0200 Subject: [PATCH 3/9] Revert "export: crash if nullptr passed" This reverts commit 41439006052f9e82c8350a3a72ef7ffb50f2a88d. Actaully, the transcoding reflector doesn't set the export module, so that it will crash with this. Crasing command: ``` hd-rum-transcode 8M 5004 -c lavc -P 6004 100:: ``` --- src/export.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/export.c b/src/export.c index 807b30591..3edb0560b 100644 --- a/src/export.c +++ b/src/export.c @@ -35,7 +35,6 @@ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include #include // for errno, EEXIST #include @@ -341,7 +340,9 @@ static void process_messages(struct exporter *s) { void export_audio(struct exporter *s, struct audio_frame *frame) { - assert(s != NULL); + if(!s){ + return; + } process_messages(s); @@ -354,7 +355,9 @@ void export_audio(struct exporter *s, struct audio_frame *frame) void export_video(struct exporter *s, struct video_frame *frame) { - assert(s != NULL); + if(!s){ + return; + } process_messages(s); From 4d5f7a76acd3b9199edd323deef85a75a10d9003 Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 11:35:02 +0200 Subject: [PATCH 4/9] kernels: report elapsed mode in debug Instead of using compile-time DEBUG macro, prefer run-time specified log_level. --- src/cuda_wrapper/kernels.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu index 967af5106..92fb8a406 100644 --- a/src/cuda_wrapper/kernels.cu +++ b/src/cuda_wrapper/kernels.cu @@ -41,11 +41,8 @@ #include #include -#ifdef DEBUG -#define D_PRINTF printf -#else -#define D_PRINTF(...) -#endif +extern volatile int log_level; +#define LOG_LEVEL_DEBUG 7 #define MEASURE_KERNEL_DURATION_START(stream) \ cudaEvent_t t0, t1; \ @@ -57,7 +54,9 @@ cudaEventSynchronize(t1); \ float elapsedTime = NAN; \ cudaEventElapsedTime(&elapsedTime, t0, t1); \ - D_PRINTF("%s elapsed time: %f ms\n", __func__, elapsedTime); \ + if (log_level >= LOG_LEVEL_DEBUG) { \ + printf("%s elapsed time: %f ms\n", __func__, elapsedTime); \ + } \ if (elapsedTime > 10.0) { \ fprintf( \ stderr, \ From f17b0d848707c9a873420ab47d7e5c063c918fe2 Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 11:36:52 +0200 Subject: [PATCH 5/9] r12l_to_rg48_compute_blk: optimize load/store uint32_t to optimize the performance This reduces the duration from some 16.6 ms to 0.6 ms for 4096x216 on 1080 Ti. refers to GH-406 --- src/cuda_wrapper/kernels.cu | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu index 92fb8a406..93a864309 100644 --- a/src/cuda_wrapper/kernels.cu +++ b/src/cuda_wrapper/kernels.cu @@ -282,7 +282,7 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x) if (position_x == size_x / 8) { // compute the last incomplete block - uint8_t tmp[48]; + alignas(uint32_t) uint8_t tmp[48]; r12l_to_rg48_compute_blk(src, tmp); for (unsigned i = 0; i < (size_x - position_x * 8) * 6; ++i) { dst[i] = tmp[i]; @@ -294,8 +294,19 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x) /// adapted variant of @ref vc_copylineR12LtoRG48 __device__ static void -r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst) +r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out) { + // load the data from in to src_u32 + auto *in_u32 = (uint32_t *) in; + uint32_t src_u32[9]; + for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) { + src_u32[i] = in_u32[i]; + } + + uint32_t dst_u32[12]; + uint8_t *dst = (uint8_t *) dst_u32; + uint8_t *src = (uint8_t *) src_u32; + // 0 // R *dst++ = src[0] << 4; @@ -376,6 +387,12 @@ r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst) *dst++ = src[32 + 2] & 0xF0; *dst++ = src[32 + 3]; + + // store the result + auto *out_u32 = (uint32_t *) out; + for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) { + out_u32[i] = dst_u32[i]; + } } void From dfba5794375c611b43fb9951751e5c52c58aa649 Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 12:16:55 +0200 Subject: [PATCH 6/9] rt48_to_r12l_compute_blk: optimize as well see the previous commit Duration for 4096x2160 reduction is from some 18.5 to 0.5 ms. refers to GH-406 --- src/cuda_wrapper/kernels.cu | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu index 93a864309..456cf6f4c 100644 --- a/src/cuda_wrapper/kernels.cu +++ b/src/cuda_wrapper/kernels.cu @@ -73,8 +73,19 @@ extern volatile int log_level; * modified @ref vc_copylineRG48toR12L */ __device__ static void -rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst) +rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out) { + // load the data from in to src_u32 + auto *in_u32 = (uint32_t *) in; + uint32_t src_u32[12]; + for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) { + src_u32[i] = in_u32[i]; + } + + uint32_t dst_u32[9]; + auto *dst = (uint8_t *) dst_u32; + auto *src = (uint8_t *) src_u32; + // 0 dst[0] = src[0] >> 4; dst[0] |= src[1] << 4; @@ -190,12 +201,18 @@ rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst) dst[32 + 2] |= src[0] & 0xF0; dst[32 + 3] = src[1]; src += 2; + + // store the result + auto *out_u32 = (uint32_t *) out; + for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) { + out_u32[i] = dst_u32[i]; + } } __device__ static void rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width) { - uint8_t tmp[48]; + alignas(uint32_t) uint8_t tmp[48]; for (unsigned i = 0; i < width * 6; ++i) { tmp[i] = src[i]; } From 4c12bc85dae7983b6f3da44dd23b3e8d0892df07 Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 13:46:32 +0200 Subject: [PATCH 7/9] r12l_to_rg48_compute_blk: fixed odd widths Fixes unaligned access introduced in HEAD~2 (optimizing the r12l_to_rg48 kernel). --- src/cuda_wrapper/kernels.cu | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu index 456cf6f4c..52d68210e 100644 --- a/src/cuda_wrapper/kernels.cu +++ b/src/cuda_wrapper/kernels.cu @@ -282,9 +282,11 @@ int postprocess_rg48_to_r12l( // / , _/ / / / __/ / /__/___/ > > / , _// (_ / /_ _// _ | // /_/|_| /_/ /____/ /____/ /_/ /_/|_| \___/ /_/ \___/ +template __device__ static void r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst); +template __global__ static void kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x) { @@ -300,16 +302,17 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x) if (position_x == size_x / 8) { // compute the last incomplete block alignas(uint32_t) uint8_t tmp[48]; - r12l_to_rg48_compute_blk(src, tmp); + r12l_to_rg48_compute_blk(src, tmp); for (unsigned i = 0; i < (size_x - position_x * 8) * 6; ++i) { dst[i] = tmp[i]; } return; } - r12l_to_rg48_compute_blk(src, dst); + r12l_to_rg48_compute_blk(src, dst); } /// adapted variant of @ref vc_copylineR12LtoRG48 +template __device__ static void r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out) { @@ -406,9 +409,15 @@ r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out) *dst++ = src[32 + 3]; // store the result - auto *out_u32 = (uint32_t *) out; + auto *out_t = (store_t *) out; for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) { - out_u32[i] = dst_u32[i]; + static_assert(sizeof(store_t) == 2 || sizeof(store_t) == 4); + if constexpr (sizeof(store_t) == 4) { + out_t[i] = dst_u32[i]; + } else { + out_t[2 * i] = dst_u32[i] & 0xFFFFU; + out_t[2 * i + 1] = dst_u32[i] >> 16; + } } } @@ -420,8 +429,20 @@ preprocess_r12l_to_rg48(int width, int height, void *src, void *dst) dim3 blocks((((width + 7) / 8) + 255) / 256, height); MEASURE_KERNEL_DURATION_START(0) - kernel_r12l_to_rg48<<>>( - (uint8_t *) src, (uint8_t *) dst, width); + if (width % 2 == 0) { + kernel_r12l_to_rg48<<>>( + (uint8_t *) src, (uint8_t *) dst, width); + } else { + thread_local bool warn_print; + if (!warn_print) { + fprintf(stderr, + "%s: Odd width %d px will use slower kernel!\n", + __func__, width); + warn_print = true; + } + kernel_r12l_to_rg48<<>>( + (uint8_t *) src, (uint8_t *) dst, width); + } MEASURE_KERNEL_DURATION_STOP(0) } From faa6a7bd7291b5a96ee6b176dd9859377628121c Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 14:12:08 +0200 Subject: [PATCH 8/9] rt48_to_r12l_compute_blk: fixed odd widths see the previous commit as well --- src/cuda_wrapper/kernels.cu | 49 +++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/src/cuda_wrapper/kernels.cu b/src/cuda_wrapper/kernels.cu index 52d68210e..b3a6accb0 100644 --- a/src/cuda_wrapper/kernels.cu +++ b/src/cuda_wrapper/kernels.cu @@ -72,15 +72,21 @@ extern volatile int log_level; /** * modified @ref vc_copylineRG48toR12L */ +template __device__ static void rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out) { - // load the data from in to src_u32 - auto *in_u32 = (uint32_t *) in; - uint32_t src_u32[12]; - for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) { - src_u32[i] = in_u32[i]; - } + // load the data from in to src_u32 + auto *in_t = (load_t *) in; + uint32_t src_u32[12]; + for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) { + static_assert(sizeof(load_t) == 2 || sizeof(load_t) == 4); + if constexpr (sizeof(load_t) == 4) { + src_u32[i] = in_t[i]; + } else { + src_u32[i] = in_t[2 * i] | in_t[2 * i + 1] << 16; + } + } uint32_t dst_u32[9]; auto *dst = (uint8_t *) dst_u32; @@ -209,6 +215,7 @@ rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out) } } +template __device__ static void rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width) { @@ -216,12 +223,13 @@ rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width) for (unsigned i = 0; i < width * 6; ++i) { tmp[i] = src[i]; } - rt48_to_r12l_compute_blk(tmp, dst); + rt48_to_r12l_compute_blk(tmp, dst); } /** * @todo fix the last block for widths not divisible by 8 */ +template __global__ static void kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x) { @@ -236,11 +244,11 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x) // handle incomplete blocks if (position_x == size_x / 8) { - rt48_to_r12l_compute_last_blk(src, dst, - size_x - position_x * 8); + rt48_to_r12l_compute_last_blk(src, dst, + size_x - position_x * 8); return; } - rt48_to_r12l_compute_blk(src, dst); + rt48_to_r12l_compute_blk(src, dst); } /** @@ -268,9 +276,24 @@ int postprocess_rg48_to_r12l( MEASURE_KERNEL_DURATION_START(stream) - kernel_rg48_to_r12l<<>>( - (uint8_t *) input_samples, (uint8_t *) output_buffer, size_x); + if (size_x % 2 == 0) { + kernel_rg48_to_r12l + <<>>( + (uint8_t *) input_samples, (uint8_t *) output_buffer, + size_x); + } else { + thread_local bool warn_print; + if (!warn_print) { + fprintf(stderr, + "%s: Odd width %d px will use slower kernel!\n", + __func__, size_x); + warn_print = true; + } + kernel_rg48_to_r12l + <<>>( + (uint8_t *) input_samples, (uint8_t *) output_buffer, + size_x); + } MEASURE_KERNEL_DURATION_STOP(stream) From 8b7db976c8561845a63ec260f8c9e2baf4de5cab Mon Sep 17 00:00:00 2001 From: Martin Pulec Date: Tue, 10 Sep 2024 14:34:15 +0200 Subject: [PATCH 9/9] cmpto_j2k: print also cpu convs duration in debug --- src/tv.h | 1 + src/video_compress/cmpto_j2k.cpp | 5 +++++ src/video_decompress/cmpto_j2k.cpp | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/src/tv.h b/src/tv.h index 2625592e3..619c226e8 100644 --- a/src/tv.h +++ b/src/tv.h @@ -97,6 +97,7 @@ typedef long long time_ns_t; #define NS_IN_US_DBL ((double) NS_IN_US) #define US_TO_MS(val_us) ((val_us) / 1000) #define MS_TO_US(val_ms) ((val_ms) * 1000) +#define NS_TO_MS(val_ns) ((val_ns) / 1000 / 1000) static inline time_ns_t get_time_in_ns() { #ifdef HAVE_TIMESPEC_GET diff --git a/src/video_compress/cmpto_j2k.cpp b/src/video_compress/cmpto_j2k.cpp index 926991d47..8eb2a946c 100644 --- a/src/video_compress/cmpto_j2k.cpp +++ b/src/video_compress/cmpto_j2k.cpp @@ -161,9 +161,14 @@ static void parallel_conv(video_frame *dst, video_frame *src){ decoder_t decoder = get_decoder_from_to(src->color_spec, dst->color_spec); assert(decoder != nullptr); + time_ns_t t0 = get_time_in_ns(); parallel_pix_conv((int) src->tiles[0].height, dst->tiles[0].data, dst_pitch, src->tiles[0].data, src_pitch, decoder, 0); + if (log_level >= LOG_LEVEL_DEBUG) { + MSG(DEBUG, "pixfmt conversion duration: %f ms\n", + NS_TO_MS((double) (get_time_in_ns() - t0))); + } } #ifdef HAVE_CUDA diff --git a/src/video_decompress/cmpto_j2k.cpp b/src/video_decompress/cmpto_j2k.cpp index 479c2851d..f0c77c7ce 100644 --- a/src/video_decompress/cmpto_j2k.cpp +++ b/src/video_decompress/cmpto_j2k.cpp @@ -140,9 +140,14 @@ static void rg48_to_r12l(unsigned char *dst_buffer, int dst_len = vc_get_linesize(width, R12L); decoder_t vc_copylineRG48toR12L = get_decoder_from_to(RG48, R12L); + time_ns_t t0 = get_time_in_ns(); parallel_pix_conv((int) height, (char *) dst_buffer, dst_len, (const char *) src_buffer, src_pitch, vc_copylineRG48toR12L, 0); + if (log_level >= LOG_LEVEL_DEBUG) { + MSG(DEBUG, "pixfmt conversion duration: %f ms\n", + NS_TO_MS((double) (get_time_in_ns() - t0))); + } } static void print_dropped(unsigned long long int dropped) {