From fd81cb70676d2b970642cc4f963a0f7ce16ccb20 Mon Sep 17 00:00:00 2001 From: zerico <71151164+ZERICO2005@users.noreply.github.com> Date: Mon, 5 Jan 2026 14:03:06 -0700 Subject: [PATCH 1/4] removed unused mempcpy variant --- src/libc/mempcpy.src | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/src/libc/mempcpy.src b/src/libc/mempcpy.src index 4e0a1c96e..7415fa75b 100644 --- a/src/libc/mempcpy.src +++ b/src/libc/mempcpy.src @@ -5,26 +5,6 @@ .global _mempcpy .type _mempcpy, @function -.if 0 - -; faster when count is zero -_mempcpy: - ld iy, -1 - add iy, sp - ld bc, (iy + 10) ; Load count - sbc hl, hl - add hl, bc - ld hl, (iy + 4) ; Load destination - ret nc ; zero bytes to copy - ld de, (iy + 7) ; Load source - ex de, hl - ldir - ex de, hl - ret - -.else - -; faster in full execution case by 0F + 1 clock cycles _mempcpy: ld iy, -1 add iy, sp @@ -38,5 +18,3 @@ _mempcpy: .L.zero_byte_copy: ex de, hl ret - -.endif From 6f3f17cc6b1ff12e672d9d1e9b1dc0e82ac3377b Mon Sep 17 00:00:00 2001 From: zerico <71151164+ZERICO2005@users.noreply.github.com> Date: Mon, 5 Jan 2026 14:17:40 -0700 Subject: [PATCH 2/4] optimized zero size test in memcpy/memmove --- src/libc/memcpy.src | 31 +++++++++---- src/libc/memmove.src | 108 +++++++++++++++++++++++++------------------ 2 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/libc/memcpy.src b/src/libc/memcpy.src index d969a77b3..8ff480ff5 100644 --- a/src/libc/memcpy.src +++ b/src/libc/memcpy.src @@ -12,17 +12,30 @@ .else _memcpy: - ld iy, -1 + ; size > 0 : 25F + 15R + 1 + LDIR + ; size >= 65536 : 32F + 16R + 3 + LDIR + ; size == 0 : 26F + 13R + 2 + ; size >= 65536 + 7F + 1R + 2 (only when the low 16 bits are zero) + + ld iy, 0 add iy, sp - ld bc, (iy + 10) ; Load count - sbc hl, hl - add hl, bc - jr nc, .L.zero - ld de, (iy + 4) ; Load destination - ld hl, (iy + 7) ; Load source + ld bc, (iy + 9) ; Load count + ld a, c + or a, b + ld de, (iy + 3) ; Load destination + jr z, .L.maybe_zero +.L.not_zero: + ld hl, (iy + 6) ; Load source ldir -.L.zero: - ld hl, (iy + 4) ; Return the destination pointer + ld hl, (iy + 3) ; Return the destination pointer + ret + +.L.maybe_zero: + ; low 16 bits are zero + or a, (iy + 11) ; test upper 8 bits + jr nz, .L.not_zero ; size >= 65536 + ; size == 0 + ex de, hl ret .endif diff --git a/src/libc/memmove.src b/src/libc/memmove.src index e8237bc6e..bb8be82f3 100644 --- a/src/libc/memmove.src +++ b/src/libc/memmove.src @@ -15,87 +15,103 @@ ; Optimized for when src != dst _memmove: - ; src > dst | LDIR | 32F + 15R + 1 - ; src < dst | LDDR | 35F + 12R + 2 - ; src = dst | LDDR | 35F + 12R + 2 - ; zero size | | 24F + 12R + 2 + ; src > dst | LDIR | 31F + 15R + 2 + ; src < dst | LDDR | 33F + 12R + 1 + ; src = dst | LDIR | 31F + 15R + 2 + ; zero size | | 26F + 10R + 2 + ; size >= 65536 + 7F + 1R + 2 (only when the low 16 bits are zero) - ld iy, -1 + ld iy, 0 add iy, sp - ld bc, (iy + 10) - sbc hl, hl - add hl, bc - jr nc, .L.zero - ld hl, (iy + 7) - ld de, (iy + 4) + ld bc, (iy + 9) + ld a, c + or a, b + ld de, (iy + 3) + jr z, .L.maybe_zero +.L.not_zero: + ld hl, (iy + 6) sbc hl, de - ; src <= dst - jr c, .L.copy_backwards - ; src > dst -; .copy_forwards: - add hl, de - inc hl - ldir -.L.zero: - ld hl, (iy + 4) - ret - -.L.copy_backwards: + ; src >= dst + jr nc, .L.copy_forwards + ; src < dst +; .L.copy_backwards: ; move HL and DE to the end + dec de ; DE = dst - 1 + ex de, hl + add hl, bc ; HL = dst + size - 1, DE = src - dst ex de, hl - add hl, bc - ex de, hl ; HL = src - dst - 1, DE = dst + size - add hl, de ; HL = src + size - 1 - dec de ; DE = dst + size - 1 + add hl, de ; HL = src + size - 1, DE = dst + size - 1 lddr ex de, hl inc hl ret +.L.copy_forwards: + add hl, de + ldir + ld hl, (iy + 3) + ret + +.L.maybe_zero: + ; low 16 bits are zero + or a, (iy + 11) ; test upper 8 bits + jr nz, .L.not_zero ; size >= 65536 + ; size == 0 + ex de, hl + ret + .else ; Optimized for when src == dst _memmove: - ; src > dst | LDIR | 33F + 15R + 2 - ; src < dst | LDDR | 36F + 12R + 2 - ; src = dst | | 29F + 12R + 2 - ; zero size | | 24F + 12R + 2 + ; src > dst | LDIR | 31F + 15R + 2 + ; src < dst | LDDR | 34F + 12R + 2 + ; src = dst | | 27F + 12R + 2 + ; zero size | | 26F + 10R + 2 + ; size >= 65536 + 7F + 1R + 2 (only when the low 16 bits are zero) - ld iy, -1 + ld iy, 0 add iy, sp - ld bc, (iy + 10) - sbc hl, hl - add hl, bc - jr nc, .L.zero - ld de, (iy + 4) - ld hl, (iy + 7) - or a, a + ld bc, (iy + 9) + ld a, c + or a, b + ld de, (iy + 3) + jr z, .L.maybe_zero +.L.not_zero: + ld hl, (iy + 6) sbc hl, de ; src < dst - jr c, .copy_backwards + jr c, .L.copy_backwards ; src >= dst ; .L.copy_forwards: add hl, de ; src == dst - ret z ; skips LDIR when src == dst + ret z ; skips LDIR when src == dst ; src > dst ldir -.L.zero: - ld hl, (iy + 4) + ld hl, (iy + 3) ret .L.copy_backwards: ; move HL and DE to the end - dec de ; DE = dst - 1 + dec de ; DE = dst - 1 ex de, hl - add hl, bc ; HL = dst + size - 1, DE = src - dst + add hl, bc ; HL = dst + size - 1, DE = src - dst ex de, hl - add hl, de ; HL = src + size - 1, DE = dst + size - 1 + add hl, de ; HL = src + size - 1, DE = dst + size - 1 lddr ex de, hl inc hl ret +.L.maybe_zero: + ; low 16 bits are zero + or a, (iy + 11) ; test upper 8 bits + jr nz, .L.not_zero ; size >= 65536 + ; size == 0 + ex de, hl + ret + .endif .endif From f29059013bcd6e40eae6f82ca6922014ab48f10e Mon Sep 17 00:00:00 2001 From: zerico <71151164+ZERICO2005@users.noreply.github.com> Date: Mon, 5 Jan 2026 16:15:13 -0700 Subject: [PATCH 3/4] added memcpy/memmove tests for non-zero multiples of 65536 --- .../asprintf_fprintf/src/fill_mem32.s | 26 +++++++++ test/standalone/asprintf_fprintf/src/main.c | 54 +++++++++++++++++++ tools/convbin | 2 +- 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 test/standalone/asprintf_fprintf/src/fill_mem32.s diff --git a/test/standalone/asprintf_fprintf/src/fill_mem32.s b/test/standalone/asprintf_fprintf/src/fill_mem32.s new file mode 100644 index 000000000..a53442d10 --- /dev/null +++ b/test/standalone/asprintf_fprintf/src/fill_mem32.s @@ -0,0 +1,26 @@ + .assume adl = 1 + + .section .text + + .global _fill_mem32 + +; void fill_mem32(void *dst, size_t bytes, uint32_t pattern) +_fill_mem32: + ld iy, 0 + add iy, sp + ld de, (iy + 3) + ld hl, (iy + 6) + ld bc, 4 + sbc hl, bc + ; return if bytes <= pattern_size + ret c + ret z + push hl + ; copy pattern once + lea hl, iy + 9 + ldir + pop bc + ; now copy (bytes - pattern_size) + ld hl, (iy + 3) + ldir + ret diff --git a/test/standalone/asprintf_fprintf/src/main.c b/test/standalone/asprintf_fprintf/src/main.c index 2abdc3adc..3c397d36e 100644 --- a/test/standalone/asprintf_fprintf/src/main.c +++ b/test/standalone/asprintf_fprintf/src/main.c @@ -114,6 +114,9 @@ void *T_memmove(void *dest, const void *src, size_t n) void *T_mempcpy(void *__restrict dest, const void *__restrict src, size_t n) __attribute__((nonnull(1, 2))); +void *T_memchr(const void *s, int c, size_t n) + __attribute__((nonnull(1))); + void *T_memrchr(const void *s, int c, size_t n) __attribute__((nonnull(1))); @@ -187,6 +190,7 @@ char *T_strtok_r(char *__restrict s, const char *__restrict delim, char **__rest #define T_memmem memmem #define T_memmove memmove #define T_mempcpy mempcpy +#define T_memchr memchr #define T_memrchr memrchr #define T_memrmem memrmem #define T_memset memset @@ -1465,6 +1469,53 @@ int strtok_test(void) { C(T_memcmp(str, truth_str, sizeof(truth_str)) == 0); } + return 0; +} + +int mem65536_test(void) { + void fill_mem32(void *dst, size_t bytes, uint32_t pattern); + + uint8_t * const dst = (uint8_t*)0xD40000; + const size_t screen_size = 320 * 240 * 2; + memset(dst, 0, screen_size); + const size_t B16 = 65536; + const size_t B17 = 131072; + + /* test return values */ + + C(T_memcpy(SINK, SINK, B16) == SINK); + C(T_memcpy(SINK, SINK, B17) == SINK); + + C(T_memmove(SINK, SINK, B16) == SINK); + C(T_memmove(SINK, SINK, B17) == SINK); + + C(T_memmove(SINK + 16, SINK, B16) == SINK + 16); + C(T_memmove(SINK + 16, SINK, B17) == SINK + 16); + + C(T_memmove(SINK, SINK + 16, B16) == SINK); + C(T_memmove(SINK, SINK + 16, B17) == SINK); + + /* test memcpy and memmove when size is a non-zero multiple of 65536 */ + + fill_mem32(dst + screen_size - B16, B16, 0x78563412); + C(T_memcpy(dst + 32, dst + screen_size - B16, B16) == dst + 32); + C(T_memchr(dst, 0x00, 32) == dst); + C(T_memchr(dst, 0x12, 32) == NULL_ptr); + C(T_memchr(dst, 0x12, 33) == dst + 32); + C(T_memrchr(dst, 0x78, 32 + B16 + 32) == dst + 32 + B16 - 1); + const uint32_t pattern_1 = 0xA3A0A1A0; + const uint32_t pattern_2 = 0xFECDAB89; + fill_mem32(dst, 32, pattern_1); + fill_mem32(dst + 24576, B16, pattern_2); + + C(T_memmove(dst + 61, dst, B16) == dst + 61); + C(T_memmem(dst, B17, &pattern_1, sizeof(pattern_1)) == dst); + C(T_memrmem(dst, B17, &pattern_1, sizeof(pattern_1)) == dst + 61 - 4 + 32); + C(T_memmove(dst + 24578, dst, B16) == dst + 24578); + C(T_memmem(dst, B16, &pattern_1, sizeof(pattern_1)) == dst + 0); + C(T_memrmem(dst, B16, &pattern_1, sizeof(pattern_1)) == dst + 24578 + 61 + 32 - 4); + C(T_memmem(dst, B16, &pattern_2, sizeof(pattern_2)) == dst + 24576 + 24578 + 61); + C(T_memrmem(dst, B16, &pattern_2, sizeof(pattern_2)) == dst + B16 - 4u - (((24578u - 24576u) - 61u) % 4u)); return 0; } @@ -1509,6 +1560,9 @@ int run_tests(void) { TEST(strchrnul_test()); TEST(strtok_test()); + TEST(mem65536_test()); + os_ClrHome(); + return 0; } diff --git a/tools/convbin b/tools/convbin index 6875d266a..da433cc62 160000 --- a/tools/convbin +++ b/tools/convbin @@ -1 +1 @@ -Subproject commit 6875d266ae52d073a30dbeb5db2d5e19ab59a8fd +Subproject commit da433cc629af31ccd7a211b868bf237e39dd6e78 From d56bedb787808d766ad7f535e0eda645b4e0d1b1 Mon Sep 17 00:00:00 2001 From: zerico <71151164+ZERICO2005@users.noreply.github.com> Date: Tue, 20 Jan 2026 14:27:14 -0700 Subject: [PATCH 4/4] made toolchain memcpy/memmove the default --- src/libc/memcpy.src | 12 +++--------- src/libc/memmove.src | 9 ++------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/libc/memcpy.src b/src/libc/memcpy.src index 8ff480ff5..f66f435e4 100644 --- a/src/libc/memcpy.src +++ b/src/libc/memcpy.src @@ -5,17 +5,13 @@ .global _memcpy .type _memcpy, @function -.ifdef PREFER_OS_LIBC - - .set _memcpy, 0x0000A4 - -.else + ; Note: TiOS memcpy works fine, but our implementation is faster + ; .set _memcpy, 0x0000A4 _memcpy: ; size > 0 : 25F + 15R + 1 + LDIR - ; size >= 65536 : 32F + 16R + 3 + LDIR + ; size >= 65536 : 32F + 16R + 3 + LDIR (only when the low 16 bits are zero) ; size == 0 : 26F + 13R + 2 - ; size >= 65536 + 7F + 1R + 2 (only when the low 16 bits are zero) ld iy, 0 add iy, sp @@ -37,5 +33,3 @@ _memcpy: ; size == 0 ex de, hl ret - -.endif diff --git a/src/libc/memmove.src b/src/libc/memmove.src index bb8be82f3..b18f3a998 100644 --- a/src/libc/memmove.src +++ b/src/libc/memmove.src @@ -5,11 +5,8 @@ .global _memmove .type _memmove, @function -.ifdef PREFER_OS_LIBC - - .set _memmove, 0x0000A8 - -.else + ; Note: TiOS memmove works fine, but our implementation is faster + ; .set _memmove, 0x0000A8 .if 1 @@ -113,5 +110,3 @@ _memmove: ret .endif - -.endif